From 0b5303e833ed7e46e5aad4e1770f9bf4ef7d6c6b Mon Sep 17 00:00:00 2001 From: Joel Schlosser Date: Wed, 27 Jan 2021 08:58:21 -0800 Subject: [PATCH 01/41] Propagate CreationMeta when chaining views (#51061) Summary: Fixes https://github.com/pytorch/pytorch/issues/49824 ## Background When creating a view of a view, there was a possibility that the new view would be less restrictive than the previous view, incorrectly sidestepping the error that should be thrown when using in-place operations on the new view. The fix addresses this by propagating `CreationMeta` from the previous view to the new view. Currently, the old view's `creation_meta` is only propagated when the new view's `creation_meta == CreationMeta::DEFAULT`. This ensures that the new view is not less restrictive than the previous view wrt. allowing in-place operations. Pull Request resolved: https://github.com/pytorch/pytorch/pull/51061 Test Plan: ``` python test/test_autograd.py TestAutogradDeviceTypeCPU.test_inplace_view_of_multiple_output_view_cpu python test/test_autograd.py TestAutogradDeviceTypeCUDA.test_inplace_view_of_multiple_output_view_cuda python test/test_autograd.py TestAutogradDeviceTypeCPU.test_inplace_multiple_output_view_of_view_cpu python test/test_autograd.py TestAutogradDeviceTypeCUDA.test_inplace_multiple_output_view_of_view_cuda ``` Reviewed By: heitorschueroff Differential Revision: D26076434 Pulled By: jbschlosser fbshipit-source-id: c47f0ddcef9b8449427b671aff9ad08edca70fcd --- test/test_autograd.py | 14 ++++++++++++++ torch/csrc/autograd/VariableTypeUtils.h | 10 ++++++++++ torch/csrc/autograd/variable.h | 9 +++++++++ 3 files changed, 33 insertions(+) diff --git a/test/test_autograd.py b/test/test_autograd.py index 28ef8d41b346..df588f97701a 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -7377,6 +7377,20 @@ def test_inplace_view_multiple_outputs(self, device): with self.assertRaises(RuntimeError): v1[0].mul_(2) + def test_inplace_view_of_multiple_output_view(self, device): + a = torch.rand(10, device=device, requires_grad=True).clone() + b = a.unbind(0) + c = b[0].view_as(b[0]) + with self.assertRaises(RuntimeError): + c.mul_(2) + + def test_inplace_multiple_output_view_of_view(self, device): + a = torch.rand(10, device=device, requires_grad=True).clone() + b = a.view_as(a) + c = b.unbind(0) + with self.assertRaises(RuntimeError): + c[0].mul_(2) + def test_inplace_view_makes_base_require_grad(self, device): # in-place modification to view makes base require grad a = torch.randn(4, 4, device=device, requires_grad=False) diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index 2894a75fed69..85b83f2aa6ee 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -145,6 +145,7 @@ inline Tensor as_view(const Tensor & base, const Tensor & tensor, bool is_bw_dif if (base.is_view()) { auto diff_view_meta = static_cast(torch::autograd::impl::get_autograd_meta(base)); const auto& base_bw_info = diff_view_meta->get_backward_view(); + creation_meta = propagate_creation_meta(diff_view_meta->get_creation_meta(), creation_meta); return make_variable_differentiable_view(tensor, base_bw_info.chain(base, tensor, view_func), c10::nullopt, creation_meta, allow_tensor_metadata_change); } else { @@ -188,6 +189,10 @@ inline Tensor as_view(const Tensor & base, const Tensor & tensor, bool is_bw_dif } if (is_fw_differentiable || is_bw_differentiable) { + if (base.is_view()) { + auto diff_view_meta = 
static_cast(torch::autograd::impl::get_autograd_meta(base)); + creation_meta = propagate_creation_meta(diff_view_meta->get_creation_meta(), creation_meta); + } return make_variable_differentiable_view(tensor, std::move(new_bw_info), std::move(new_fw_info), creation_meta, allow_tensor_metadata_change); } else { @@ -234,6 +239,11 @@ inline std::vector as_view(const Tensor & base, std::vector& ten } } + if ((is_fw_differentiable || is_bw_differentiable) && base.is_view()) { + auto diff_view_meta = static_cast(torch::autograd::impl::get_autograd_meta(base)); + creation_meta = propagate_creation_meta(diff_view_meta->get_creation_meta(), creation_meta); + } + for(Tensor &tensor : tensors) { if (is_fw_differentiable || is_bw_differentiable) { tensor = make_variable_differentiable_view(tensor, new_bw_info, new_fw_info, creation_meta); diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index 9cdf40fe2c63..83bbc9406081 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -502,6 +502,15 @@ struct TORCH_API ViewInfo { enum class CreationMeta: uint8_t { DEFAULT, IN_CUSTOM_FUNCTION, MULTI_OUTPUT_NODE, NO_GRAD_MODE, MULTI_OUTPUT_SAFE }; +/// Handles correctly propagating CreationMeta when a new view is created from a previous view. +/// In general, we don't want the new view to be _less_ restrictive than the previous view +/// (it's okay to be _more_ restrictive). A CreationMeta value of DEFAULT is currently the least +/// restrictive, as the behavior for all other CreationMeta values is to error out for in-place ops. +/// If this changes, the logic here will need to be updated to properly handle the new semantics. +inline CreationMeta propagate_creation_meta(CreationMeta prev_view_creation_meta, CreationMeta new_view_creation_meta) { + return (new_view_creation_meta == CreationMeta::DEFAULT) ? prev_view_creation_meta : new_view_creation_meta; +} + /// Unified function to handle error checking when rebase happens /// indirect=true means that the caller is not doing the inplace, but the inplace happened /// somewhere else. 
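To make the new behavior concrete, here is a minimal Python sketch that mirrors the `test_inplace_view_of_multiple_output_view` test added above; it is illustrative only and not part of the diff:

```python
import torch

# With CreationMeta propagation, a view chained on top of a multi-output view
# inherits the stricter creation metadata, so the in-place op is rejected
# instead of silently sidestepping the check.
a = torch.rand(10, requires_grad=True).clone()
b = a.unbind(0)            # multi-output view (MULTI_OUTPUT_NODE)
c = b[0].view_as(b[0])     # chained view; previously fell back to DEFAULT
try:
    c.mul_(2)
except RuntimeError as err:
    print("in-place on the chained view is rejected:", err)
```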
From 4a2aa0f5f1572d8a9caca9496669f8f52cfa1522 Mon Sep 17 00:00:00 2001 From: anjali411 Date: Wed, 27 Jan 2021 09:09:09 -0800 Subject: [PATCH 02/41] index_put_ for complex tensors on CUDA (#51148) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51148 Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D26102025 Pulled By: anjali411 fbshipit-source-id: b1b6fd12fda03c4520a3c3200226edf352496188 --- aten/src/ATen/native/cuda/Indexing.cu | 2 +- test/test_indexing.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 035dc188c81c..6b3304cff421 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -230,7 +230,7 @@ void index_put_accum_kernel(Tensor & self, const c10::List std::min(std::max(1,nElemBefore), at::cuda::getCurrentDeviceProperties()->maxGridSize[2])); dim3 block(C10_WARP_SIZE, indices_per_block); - AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, value_.scalar_type(), "indexing_backward", [&] { indexing_backward_kernel<<>>( sorted_indices.data_ptr(), diff --git a/test/test_indexing.py b/test/test_indexing.py index b92fd94e8cbd..10e4a9bafe95 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -762,9 +762,9 @@ def test_int_indices(self, device): self.assertEqual(v[:, [0, 4, 2]].shape, (5, 3, 3)) self.assertEqual(v[:, [[0, 1], [4, 3]]].shape, (5, 2, 2, 3)) - @dtypes(torch.float, torch.bfloat16, torch.long, torch.bool) - @dtypesIfCPU(torch.float, torch.long, torch.bool, torch.bfloat16) - @dtypesIfCUDA(torch.half, torch.long, torch.bool, torch.bfloat16) + @dtypes(torch.cfloat, torch.cdouble, torch.float, torch.bfloat16, torch.long, torch.bool) + @dtypesIfCPU(torch.cfloat, torch.cdouble, torch.float, torch.long, torch.bool, torch.bfloat16) + @dtypesIfCUDA(torch.cfloat, torch.cdouble, torch.half, torch.long, torch.bool, torch.bfloat16) def test_index_put_src_datatype(self, device, dtype): src = torch.ones(3, 2, 4, device=device, dtype=dtype) vals = torch.ones(3, 2, 4, device=device, dtype=dtype) From 16dd5ca8abd926bed2f49365171821c33a749799 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Wed, 27 Jan 2021 10:29:41 -0800 Subject: [PATCH 03/41] Followup of kron PR (#51045) Summary: Followup of https://github.com/pytorch/pytorch/pull/50927 Pull Request resolved: https://github.com/pytorch/pytorch/pull/51045 Reviewed By: mruberry Differential Revision: D26089204 Pulled By: ngimel fbshipit-source-id: 77291dd83fba32d6f80a8540910b112a1d85a892 --- aten/src/ATen/native/LinearAlgebra.cpp | 22 ++++++++++++++-------- test/test_linalg.py | 12 ++++++++++++ 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index a4b6e4df43f8..47ff70b93231 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -2223,17 +2223,17 @@ Tensor chain_matmul(TensorList matrices) { Calculates the Kronecker product between two Tensors. 
*/ Tensor& kron_out(Tensor& result, const Tensor& self, const Tensor& other) { - auto maxdim = std::max(self.dim(), other.dim()); - auto pad_self = maxdim - self.dim(); - auto pad_other = maxdim - other.dim(); + int64_t maxdim = std::max(self.dim(), other.dim()); + int64_t pad_self = maxdim - self.dim(); + int64_t pad_other = maxdim - other.dim(); c10::SmallVector a_reshape(2 * maxdim); c10::SmallVector b_reshape(2 * maxdim); c10::SmallVector result_reshape(maxdim); - for (int i = 0; i < maxdim; i++) { - a_reshape[2 * i] = i >= pad_self ? self.sizes()[i - pad_self] : 1; + for (int64_t i = 0; i < maxdim; i++) { + a_reshape[2 * i] = (i >= pad_self ? self.sizes()[i - pad_self] : 1); a_reshape[2 * i + 1] = 1; b_reshape[2 * i] = 1; - b_reshape[2 * i + 1] = i >= pad_other ? other.sizes()[i - pad_other] : 1; + b_reshape[2 * i + 1] = (i >= pad_other ? other.sizes()[i - pad_other] : 1); result_reshape[i] = a_reshape[2 * i] * b_reshape[2 * i + 1]; } auto self_view = at::_unsafe_view(self, a_reshape); @@ -2241,8 +2241,14 @@ Tensor& kron_out(Tensor& result, const Tensor& self, const Tensor& other) { if (!result.defined()) { result = at::_unsafe_view(at::mul(self_view, other_view), result_reshape); } else { - at::mul_out(result, self_view, other_view); - result.resize_(result_reshape); + c10::SmallVector mul_shape(2 * maxdim); + for (int64_t i = 0; i < maxdim; i++) { + mul_shape[2 * i] = a_reshape[2 * i]; + mul_shape[2 * i + 1] = b_reshape[2 * i + 1]; + } + resize_output(result, result_reshape); + auto result_mul = at::_unsafe_view(result, mul_shape); + at::mul_out(result_mul, self_view, other_view); } return result; } diff --git a/test/test_linalg.py b/test/test_linalg.py index f2ee0dcaaef9..fd70ebaad04f 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -828,6 +828,18 @@ def run_test_skipped_elements(a_shape, b_shape): # run_test_transposed(a_shape, b_shape) run_test_skipped_elements(a_shape, b_shape) + # Test that kron perserve memory format + a = torch.randn(1, 2, 3, 4, dtype=dtype, device=device).contiguous(memory_format=torch.channels_last) + b = torch.randn(1, 2, 3, 4, dtype=dtype, device=device).contiguous(memory_format=torch.channels_last) + c = torch.kron(a, b) + self.assertTrue(c.is_contiguous(memory_format=torch.channels_last)) + torch.kron(a, b, out=c) + self.assertTrue(c.is_contiguous(memory_format=torch.channels_last)) + c = c.contiguous(memory_format=torch.contiguous_format) + torch.kron(a, b, out=c) + self.assertTrue(c.is_contiguous(memory_format=torch.contiguous_format)) + + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_kron_empty(self, device, dtype): From e9ffad088f720b70dc6836cc8672b812560ededa Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Wed, 27 Jan 2021 10:32:35 -0800 Subject: [PATCH 04/41] numeric suite: add types to eager (#51168) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51168 Adds types to function I/O for numeric suite. This is for readability and static type checking with mypy. 
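As a usage sketch of the module these annotations cover: the snippet below builds a tiny float model, quantizes it dynamically, and calls the now-typed `compare_weights` entry point. The model and quantization flow are illustrative assumptions, not part of this diff:

```python
import torch
import torch.nn as nn
import torch.quantization as tq
import torch.quantization._numeric_suite as ns

# Illustrative float model and a dynamically quantized copy of it.
float_model = nn.Sequential(nn.Linear(4, 4), nn.ReLU()).eval()
qmodel = tq.quantize_dynamic(float_model, {nn.Linear}, dtype=torch.qint8)

# compare_weights pairs each float weight with its quantized counterpart,
# keyed by module name, with 'float' and 'quantized' entries per key.
wt_compare_dict = ns.compare_weights(float_model.state_dict(), qmodel.state_dict())
for name, entry in wt_compare_dict.items():
    print(name, entry["float"].dtype, entry["quantized"].dtype)
```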
Test Plan: ``` mypy torch/quantization/ ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D26092454 fbshipit-source-id: d37cf61e4d9604f4bc550b392f55fb59165f7624 --- torch/quantization/_numeric_suite.py | 68 +++++++++++++++++----------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/torch/quantization/_numeric_suite.py b/torch/quantization/_numeric_suite.py index 100ff54d4436..de0b4083b390 100644 --- a/torch/quantization/_numeric_suite.py +++ b/torch/quantization/_numeric_suite.py @@ -1,10 +1,9 @@ - import torch import torch.nn as nn import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd from torch.quantization import prepare -from typing import Dict +from typing import Dict, List, Optional, Any, Union, Callable, Set from .quantization_mappings import ( get_default_compare_output_module_list, @@ -18,7 +17,10 @@ } -def _find_match(str_list, key_str, postfix): +def _find_match( + str_list: Union[Dict[str, Any], List[str]], key_str: str, + postfix: str, +) -> Optional[str]: split_str = key_str.split(".") if split_str[-1] == postfix: match_string = "".join(key_str.split(".")[0:-1]) @@ -42,11 +44,14 @@ def _find_match(str_list, key_str, postfix): return s2 if match_string == pattern2: return s2 + return None else: return None -def compare_weights(float_dict, quantized_dict): +def compare_weights( + float_dict: Dict[str, Any], quantized_dict: Dict[str, Any] +) -> Dict[str, Dict[str, torch.Tensor]]: r"""Compare the weights of the float module with its corresponding quantized module. Return a dict with key corresponding to module names and each entry being a dictionary with two keys 'float' and 'quantized', containing the float and @@ -105,7 +110,10 @@ def compare_weights(float_dict, quantized_dict): return weight_dict -def _get_logger_dict_helper(mod, target_dict, prefix=""): +def _get_logger_dict_helper( + mod: nn.Module, target_dict: Dict[str, Any], + prefix: str = "", +) -> None: r"""This is the helper function for get_logger_dict Args: @@ -127,7 +135,7 @@ def get_prefix(prefix): _get_logger_dict_helper(child, target_dict, module_prefix) -def get_logger_dict(mod, prefix=""): +def get_logger_dict(mod: nn.Module, prefix: str = "") -> Dict[str, Dict]: r"""Traverse the modules and save all logger stats into target dict. This is mainly used for quantization accuracy debug. 
@@ -195,11 +203,11 @@ def forward(self, x): return x -def _convert_tuple_to_list(t): +def _convert_tuple_to_list(t: Any) -> Any: return list(_convert_tuple_to_list(x) for x in t) if type(t) is tuple else t -def _dequantize_tensor_list(t): +def _dequantize_tensor_list(t: Any) -> Any: return ( list(_dequantize_tensor_list(x) for x in t) if type(t) is list @@ -228,7 +236,7 @@ def __init__(self, q_module, float_module, Logger): self.dequant = nnq.DeQuantize() self.logger = Logger() - def forward(self, *x): + def forward(self, *x) -> torch.Tensor: xl = _convert_tuple_to_list(x) output = self.orig_module(*xl) xl_float = _dequantize_tensor_list(xl) @@ -236,7 +244,7 @@ def forward(self, *x): self.logger(output, shadow_output) return output - def add(self, x, y): + def add(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: output = self.orig_module.add(x, y) x = x.dequantize() y = y.dequantize() @@ -244,14 +252,14 @@ def add(self, x, y): self.logger(output, shadow_output) return output - def add_scalar(self, x, y): + def add_scalar(self, x: torch.Tensor, y: float) -> torch.Tensor: output = self.orig_module.add_scalar(x, y) x = x.dequantize() shadow_output = self.shadow_module.add_scalar(x, y) self.logger(output, shadow_output) return output - def mul(self, x, y): + def mul(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: output = self.orig_module.mul(x, y) x = x.dequantize() y = y.dequantize() @@ -259,21 +267,21 @@ def mul(self, x, y): self.logger(output, shadow_output) return output - def mul_scalar(self, x, y): + def mul_scalar(self, x: torch.Tensor, y: float) -> torch.Tensor: output = self.orig_module.mul_scalar(x, y) x = x.dequantize() shadow_output = self.shadow_module.mul_scalar(x, y) self.logger(output, shadow_output) return output - def cat(self, x, dim=0): + def cat(self, x: List[torch.Tensor], dim: int = 0) -> torch.Tensor: output = self.orig_module.cat(x, dim) x = [y.dequantize() for y in x] shadow_output = self.shadow_module.cat(x, dim) self.logger(output, shadow_output) return output - def add_relu(self, x, y): + def add_relu(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: output = self.orig_module.add_relu(x, y) x = x.dequantize() y = y.dequantize() @@ -282,7 +290,10 @@ def add_relu(self, x, y): return output -def prepare_model_with_stubs(float_module, q_module, module_swap_list, Logger): +def prepare_model_with_stubs( + float_module: nn.Module, q_module: nn.Module, + module_swap_list: Set[type], Logger: Callable, +) -> None: r"""Prepare the model by attaching the float module to its matching quantized module as the shadow if the float module type is in module_swap_list. @@ -322,8 +333,9 @@ def prepare_model_with_stubs(float_module, q_module, module_swap_list, Logger): def compare_model_stub( - float_model, q_model, module_swap_list, *data, Logger=ShadowLogger -): + float_model: nn.Module, q_model: nn.Module, module_swap_list: Set[type], + *data, Logger=ShadowLogger +) -> Dict[str, Dict]: r"""Compare quantized module in a model with its floating point counterpart, feeding both of them the same input. Return a dict with key corresponding to module names and each entry being a dictionary with two keys 'float' and @@ -361,7 +373,9 @@ def compare_model_stub( return ob_dict -def get_matching_activations(float_module, q_module): +def get_matching_activations( + float_module: nn.Module, q_module: nn.Module, +) -> Dict[str, Dict[str, torch.Tensor]]: r"""Find the matching activation between float and quantized modules. 
Args: @@ -387,11 +401,11 @@ def get_matching_activations(float_module, q_module): def prepare_model_outputs( - float_module, - q_module, + float_module: nn.Module, + q_module: nn.Module, Logger=OutputLogger, allow_list=None -): +) -> None: r"""Prepare the model by attaching the logger to both float module and quantized module if they are in the allow_list. @@ -406,9 +420,9 @@ def prepare_model_outputs( allow_list = get_default_compare_output_module_list() qconfig_debug = torch.quantization.QConfig(activation=Logger, weight=None) - float_module.qconfig = qconfig_debug + float_module.qconfig = qconfig_debug # type: ignore prepare(float_module, inplace=True, allow_list=allow_list) - q_module.qconfig = qconfig_debug + q_module.qconfig = qconfig_debug # type: ignore prepare( q_module, inplace=True, @@ -418,12 +432,12 @@ def prepare_model_outputs( def compare_model_outputs( - float_model, - q_model, + float_model: nn.Module, + q_model: nn.Module, *data, Logger=OutputLogger, allow_list=None -): +) -> Dict[str, Dict[str, torch.Tensor]]: r"""Compare output activations between float and quantized models at corresponding locations for the same input. Return a dict with key corresponding to quantized module names and each entry being a dictionary with two keys From 9b6d463704d1dbac9948cde8d39e6790640a23eb Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Wed, 27 Jan 2021 10:34:56 -0800 Subject: [PATCH 05/41] Move std and var tests to OpInfos (#50901) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50901 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D26083289 Pulled By: mruberry fbshipit-source-id: 7e14ff37bba46dd456e0bc0aa9c4e0a632d0734c --- test/test_ops.py | 18 +++---- test/test_torch.py | 6 --- .../_internal/common_methods_invocations.py | 49 +++++++++++++++---- 3 files changed, 48 insertions(+), 25 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index bd82aca3820a..40627cd4b264 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -2,7 +2,6 @@ import torch -from torch.testing import floating_and_complex_types_and from torch.testing._internal.common_utils import \ (TestCase, run_tests, IS_SANDCASTLE, clone_input_helper) from torch.testing._internal.common_methods_invocations import \ @@ -191,7 +190,8 @@ def check_variant_backward(self, input, forward_result, expected_grad, expected_ # against eager's gold standard op function variant @ops(op_db) def test_variant_consistency_eager(self, device, dtype, op): - samples = op.sample_inputs(device, dtype, requires_grad=True) + test_backward = op.test_complex_grad or not dtype.is_complex + samples = op.sample_inputs(device, dtype, requires_grad=test_backward) if len(samples) == 0: self.skipTest("Skipped! 
No sample inputs!") @@ -237,7 +237,7 @@ def test_variant_consistency_eager(self, device, dtype, op): self.assertEqual(variant_forward, expected_forward) # Compares variant's backward - if variant is not inplace or op.test_inplace_grad: + if test_backward and (variant is not inplace or op.test_inplace_grad): self.check_variant_backward(sample.input, variant_forward, expected_grad, exception_during_backwards) @@ -247,7 +247,10 @@ def test_variant_consistency_eager(self, device, dtype, op): # TODO WARNING: inplace x {traced, scripted} not currently tested @ops(op_db) def test_variant_consistency_jit(self, device, dtype, op): - samples = op.sample_inputs(device, dtype, requires_grad=True) + test_backward = ( + (dtype.is_complex and op.test_complex_grad) or + (dtype.is_floating_point and (not op.skip_bfloat16_grad or dtype != torch.bfloat16))) + samples = op.sample_inputs(device, dtype, requires_grad=test_backward) if len(samples) == 0: self.skipTest("Skipped! No sample inputs!") @@ -279,9 +282,6 @@ def fn(*inputs, **kwargs): output = func(*inputs, **kwargs) return op.output_func(output) - # bfloat16 grad doesn't work for some operators - dtypes_to_grad_check = floating_and_complex_types_and(torch.half) \ - if op.skip_bfloat16_grad else floating_and_complex_types_and(torch.half, torch.bfloat16) # Check scripted forward, grad, and grad grad script_fn = create_script_fn(self, name, func_type, op.output_func) @@ -291,7 +291,7 @@ def fn(*inputs, **kwargs): fn, (*sample.input,) + sample.args, sample.kwargs, - no_grad=(dtype not in dtypes_to_grad_check)) + no_grad=not test_backward) # Check traced forward, grad, and grad grad traced_fn = create_traced_fn(self, variant) @@ -300,7 +300,7 @@ def fn(*inputs, **kwargs): fn, (*sample.input,) + sample.args, sample.kwargs, - no_grad=(dtype not in dtypes_to_grad_check)) + no_grad=not test_backward) # Check alias annotation schema for correctness (make # sure inputs that aren't supposed to be modified aren't) diff --git a/test/test_torch.py b/test/test_torch.py index 4ace1012167d..55f5ee73c187 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6723,12 +6723,6 @@ def inner(self, device, dtype): ('remainder', 'negative_tensor', _small_3d, lambda t, d: [0 - _small_3d(t, d, has_zeros=False)], 1e-1, 1e-2, 1e-5, _signed_types), - ('std', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False), - ('std', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False), - ('std', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False), - ('var', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False), - ('var', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False), - ('var', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes(), _cpu_types, False), ('ndimension', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('nelement', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('numel', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 90aa1468bd4a..8be600914d97 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -845,6 +845,21 @@ def sample_inputs_linalg_solve(op_info, device, dtype, 
requires_grad=False): return out +def sample_inputs_std_var(op_info, device, dtype, requires_grad): + tensor_nd = make_tensor((S, S, S), device=device, dtype=dtype, + low=None, high=None, requires_grad=requires_grad) + tensor_1d = make_tensor((S,), device=device, dtype=dtype, + low=None, high=None, requires_grad=requires_grad) + + return [ + SampleInput(tensor_nd), + SampleInput(tensor_nd, kwargs=dict(dim=1)), + SampleInput(tensor_nd, kwargs=dict(dim=1, unbiased=True, keepdim=True)), + SampleInput(tensor_1d, kwargs=dict(dim=0, unbiased=True, keepdim=True)), + SampleInput(tensor_1d, kwargs=dict(dim=0, unbiased=False, keepdim=False)), + ] + + def _sample_inputs_svd(op_info, device, dtype, requires_grad=False, is_linalg_svd=False): """ This function generates input for torch.svd with distinct singular values so that autograd is always stable. @@ -1437,6 +1452,18 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): SkipInfo('TestCommon', 'test_variant_consistency_jit', device_type='cuda', dtypes=[torch.float16]), )), + OpInfo('std', + dtypes=floating_types_and(), + dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_std_var, + supports_tensor_out=False, + test_complex_grad=False, + test_inplace_grad=False, + # std has only partial support for complex and half (#51127) + skips=(SkipInfo('TestOpInfo', 'test_unsupported_dtypes', + dtypes=[torch.half, torch.complex64, torch.complex128]),), + assert_autodiffed=True, + ), UnaryUfuncInfo('tan', ref=np.tan, dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), @@ -1726,6 +1753,18 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): supports_tensor_out=False, test_inplace_grad=False, sample_inputs_func=sample_repeat_tile), + OpInfo('var', + dtypes=floating_types_and(), + dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_std_var, + supports_tensor_out=False, + test_complex_grad=False, + test_inplace_grad=False, + # var has only partial support for complex and half (#51127) + skips=(SkipInfo('TestOpInfo', 'test_unsupported_dtypes', + dtypes=[torch.half, torch.complex64, torch.complex128]),), + assert_autodiffed=True, + ), ] if TEST_SCIPY: @@ -2294,16 +2333,6 @@ def method_tests(): ('prod', (torch.tensor(0., requires_grad=True)), NO_ARGS, 'scalar_zero'), ('prod', (torch.tensor(0., requires_grad=True)), (0,), 'scalar_dim_zero', (), [0]), ('prod', (torch.tensor(0., requires_grad=True)), (0, True,), 'scalar_keepdim_dim_zero', (), [0]), - ('var', (S, S, S), NO_ARGS, '', (True,)), - ('var', (S, S, S), (1,), 'dim', (True,), [0]), - ('var', (S, S, S), (1, True, True), 'keepdim_dim', (True,), [0]), - ('var', (S,), (0,), 'dim_1d', (True,), [0]), - ('var', (S,), (0, True, True), 'keepdim_dim_1d', (True,), [0]), - ('std', (S, S, S), NO_ARGS, '', (True,)), - ('std', (S, S, S), (1,), 'dim', (True,), [0]), - ('std', (S, S, S), (1, True, True), 'keepdim_dim', (True,), [0]), - ('std', (S,), (0,), 'dim_1d', (True,), [0]), - ('std', (S,), (0, True, True), 'keepdim_dim_1d', (True,), [0]), ('var_mean', (S, S, S), NO_ARGS, ''), ('var_mean', (S, S, S), (1,), 'dim', [0]), ('var_mean', (S, S, S), (1, True, True), 'keepdim_dim', [0]), From 00adc7b07f5f89af1c9d3db36c656233566b92e7 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 27 Jan 2021 10:49:10 -0800 Subject: [PATCH 06/41] Fix more JIT tests under Python-3.9 (#51182) Summary: Mostly replace `global Foo` with `make_global(Foo)` The only real fix is 
generating Subscript annotation, which is a follow up from https://github.com/pytorch/pytorch/pull/48676 Fixes https://github.com/pytorch/pytorch/issues/49617 Pull Request resolved: https://github.com/pytorch/pytorch/pull/51182 Reviewed By: gmagogsfm Differential Revision: D26095244 Pulled By: malfet fbshipit-source-id: 0e043d9a2cf43fff71dfbb341f708cd7af87c39a --- test/jit/test_class_type.py | 44 +++++++++++++------------- test/jit/test_enum.py | 63 +++++++++++++++---------------------- test/jit/test_with.py | 14 ++++----- test/test_jit.py | 6 ++-- torch/_jit_internal.py | 4 ++- 5 files changed, 62 insertions(+), 69 deletions(-) diff --git a/test/jit/test_class_type.py b/test/jit/test_class_type.py index 4d3d73e5f7c7..3a5881f365c1 100644 --- a/test/jit/test_class_type.py +++ b/test/jit/test_class_type.py @@ -11,7 +11,7 @@ # Make the helper files in test/ importable pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) -from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.jit_utils import JitTestCase, make_global import torch.testing._internal.jit_utils from torch.testing._internal.common_utils import IS_SANDCASTLE from typing import List, Tuple, Iterable, Optional, Dict @@ -143,12 +143,12 @@ def __init__(self, x): self.attr = x def test_class_type_as_param(self): - global FooTest # see [local resolution in python] - class FooTest(object): # noqa: B903 def __init__(self, x): self.attr = x + make_global(FooTest) # see [local resolution in python] + @torch.jit.script def fn(foo: FooTest) -> torch.Tensor: return foo.attr @@ -279,13 +279,13 @@ def forward(self, a): self.assertEqual(2 * input, output) def test_python_interop(self): - global Foo # see [local resolution in python] - class Foo(object): # noqa: B903 def __init__(self, x, y): self.x = x self.y = y + make_global(Foo) # see [local resolution in python] + @torch.jit.script def use_foo(foo: Foo) -> Foo: return foo @@ -305,13 +305,13 @@ def use_foo(foo: Foo) -> Foo: self.assertEqual(y, f2.y) def test_class_specialization(self): - global Foo # see [local resolution in python] - class Foo(object): # noqa: B903 def __init__(self, x, y): self.x = x self.y = y + make_global(Foo) # see [local resolution in python] + def use_foo(foo: Foo, foo2: Foo, tup: Tuple[Foo, Foo]) -> torch.Tensor: a, b = tup return foo.x + foo2.y + a.x + b.y @@ -329,8 +329,6 @@ def use_foo(foo: Foo, foo2: Foo, tup: Tuple[Foo, Foo]) -> torch.Tensor: FileCheck().check_count("prim::GetAttr", 4).run(graphstr) def test_class_sorting(self): - global Foo # see [local resolution in python] - class Foo(object): # noqa: B903 def __init__(self, x: int) -> None: self.x = x @@ -342,6 +340,8 @@ def __lt__(self, other) -> bool: def getVal(self): return self.x + make_global(Foo) # see [local resolution in python] + def test(li: List[Foo], reverse: bool = False) -> Tuple[List[int], List[int]]: li_sorted = sorted(li) ret_sorted = torch.jit.annotate(List[int], []) @@ -500,8 +500,6 @@ def forward(self, a): self.assertEqual(3 * input, output) def test_interface(self): - global Foo, Bar, OneTwo, OneTwoThree, OneTwoWrong, NotMember, NotMember2 - @torch.jit.script class Foo(object): def __init__(self): @@ -571,6 +569,8 @@ def one(self, x, y): def two(self, x: int) -> int: return 3 + make_global(Foo, Bar, OneTwo, OneTwoThree, OneTwoWrong, NotMember, NotMember2) + def use_them(x): a = Foo() b = Bar() @@ -652,8 +652,6 @@ def __init__(self): # NamedTuple inheritance errors def test_overloaded_fn(self): - 
global Foo, MyClass # see [local resolution in python] - @torch.jit.script class Foo(object): def __init__(self, x): @@ -673,6 +671,8 @@ def test_overload(): a = Foo(torch.ones([3, 3])) return len(a), -a * torch.zeros([3, 3]) + make_global(Foo) # see [local resolution in python] + self.checkScript(test_overload, ()) # unary ops tested above @@ -737,6 +737,8 @@ def __call__(self, val: int) -> int: return self.x * val * 3 + make_global(Foo) # see [local resolution in python] + def add(): return MyClass(4) + 3 def sub(): # noqa: E306 @@ -787,8 +789,6 @@ def test(): return Foo(torch.tensor(1)) + Foo(torch.tensor(1)) def test_cast_overloads(self): - global Foo # see [local resolution in python] - @torch.jit.script class Foo(object): def __init__(self, val: float) -> None: @@ -806,6 +806,8 @@ def __bool__(self): def __str__(self): return str(self.val) + make_global(Foo) # see [local resolution in python] + def test(foo: Foo) -> Tuple[int, float, bool]: if foo: pass @@ -914,8 +916,6 @@ def forward(self, x): self.assertEqual(m.w, m_loaded.w) def test_py_class_to_ivalue_missing_attribute(self): - global Foo # see [local resolution in python] - class Foo(object): i : int f : float @@ -924,6 +924,8 @@ def __init__(self, i : int, f : float): self.i = i self.f = f + make_global(Foo) # see [local resolution in python] + @torch.jit.script def test_fn(x : Foo) -> float: return x.i + x.f @@ -1132,8 +1134,6 @@ def test_staticmethod(self): """ Test static methods on class types. """ - global ClassWithStaticMethod - @torch.jit.script class ClassWithStaticMethod: def __init__(self, a: int, b: int): @@ -1164,14 +1164,14 @@ def create_from(a: int, b: int) -> 'ClassWithStaticMethod': def test_function(a: int, b: int) -> 'ClassWithStaticMethod': return ClassWithStaticMethod.create_from(a, b) + make_global(ClassWithStaticMethod) + self.checkScript(test_function, (1, 2)) def test_classmethod(self): """ Test classmethods on class types. 
""" - global ClassWithClassMethod - @torch.jit.script class ClassWithClassMethod: def __init__(self, a: int): @@ -1184,6 +1184,8 @@ def __eq__(self, other: 'ClassWithClassMethod'): def create(cls, a: int) -> 'ClassWithClassMethod': return cls(a) + make_global(ClassWithClassMethod) + def test_function(a: int) -> 'ClassWithClassMethod': x = ClassWithClassMethod(a) # Support calling classmethod with an instance diff --git a/test/jit/test_enum.py b/test/jit/test_enum.py index b39732d0e9bc..1a5f79d6e3ca 100644 --- a/test/jit/test_enum.py +++ b/test/jit/test_enum.py @@ -9,7 +9,7 @@ # Make the helper files in test/ importable pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) -from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.jit_utils import JitTestCase, make_global if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" @@ -18,24 +18,20 @@ class TestEnum(JitTestCase): def test_enum_value_types(self): - global IntEnum - class IntEnum(Enum): FOO = 1 BAR = 2 - global FloatEnum - class FloatEnum(Enum): FOO = 1.2 BAR = 2.3 - global StringEnum - class StringEnum(Enum): FOO = "foo as in foo bar" BAR = "bar as in foo bar" + make_global(IntEnum, FloatEnum, StringEnum) + @torch.jit.script def supported_enum_types(a: IntEnum, b: FloatEnum, c: StringEnum): return (a.name, b.name, c.name) @@ -46,12 +42,12 @@ def supported_enum_types(a: IntEnum, b: FloatEnum, c: StringEnum): .check("StringEnum") \ .run(str(supported_enum_types.graph)) - global TensorEnum - class TensorEnum(Enum): FOO = torch.tensor(0) BAR = torch.tensor(1) + make_global(TensorEnum) + def unsupported_enum_types(a: TensorEnum): return a.name @@ -59,12 +55,12 @@ def unsupported_enum_types(a: TensorEnum): torch.jit.script(unsupported_enum_types) def test_enum_comp(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + @torch.jit.script def enum_comp(x: Color, y: Color) -> bool: return x == y @@ -75,8 +71,6 @@ def enum_comp(x: Color, y: Color) -> bool: self.assertEqual(enum_comp(Color.RED, Color.GREEN), False) def test_enum_comp_diff_classes(self): - global Foo, Bar - class Foo(Enum): ITEM1 = 1 ITEM2 = 2 @@ -85,6 +79,8 @@ class Bar(Enum): ITEM1 = 1 ITEM2 = 2 + make_global(Foo, Bar) + @torch.jit.script def enum_comp(x: Foo) -> bool: return x == Bar.ITEM1 @@ -98,12 +94,12 @@ def enum_comp(x: Foo) -> bool: self.assertEqual(enum_comp(Foo.ITEM1), False) def test_heterogenous_value_type_enum_error(self): - global Color - class Color(Enum): RED = 1 GREEN = "green" + make_global(Color) + def enum_comp(x: Color, y: Color) -> bool: return x == y @@ -111,12 +107,12 @@ def enum_comp(x: Color, y: Color) -> bool: torch.jit.script(enum_comp) def test_enum_name(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + @torch.jit.script def enum_name(x: Color) -> str: return x.name @@ -131,12 +127,12 @@ def enum_name(x: Color) -> str: self.assertEqual(enum_name(Color.GREEN), Color.GREEN.name) def test_enum_value(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + @torch.jit.script def enum_value(x: Color) -> int: return x.value @@ -151,12 +147,12 @@ def enum_value(x: Color) -> int: self.assertEqual(enum_value(Color.GREEN), Color.GREEN.value) def test_enum_as_const(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + @torch.jit.script def enum_const(x: Color) -> bool: return x == Color.RED @@ -171,12 
+167,12 @@ def enum_const(x: Color) -> bool: self.assertEqual(enum_const(Color.GREEN), False) def test_non_existent_enum_value(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + def enum_const(x: Color) -> bool: if x == Color.PURPLE: return True @@ -187,12 +183,12 @@ def enum_const(x: Color) -> bool: torch.jit.script(enum_const) def test_enum_ivalue_type(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + @torch.jit.script def is_color_enum(x: Any): return isinstance(x, Color) @@ -207,8 +203,6 @@ def is_color_enum(x: Any): self.assertEqual(is_color_enum(1), False) def test_closed_over_enum_constant(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 @@ -240,8 +234,6 @@ def closed_over_aliased_value(): self.assertEqual(closed_over_aliased_value(), Color.RED.value) def test_enum_as_module_attribute(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 @@ -268,8 +260,6 @@ def forward(self): self.assertEqual(scripted(), Color.RED.value) def test_string_enum_as_module_attribute(self): - global Color - class Color(Enum): RED = "red" GREEN = "green" @@ -282,18 +272,19 @@ def __init__(self, e: Color): def forward(self): return (self.e.name, self.e.value) + make_global(Color) m = TestModule(Color.RED) scripted = torch.jit.script(m) self.assertEqual(scripted(), (Color.RED.name, Color.RED.value)) def test_enum_return(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + @torch.jit.script def return_enum(cond: bool): if cond: @@ -305,8 +296,6 @@ def return_enum(cond: bool): self.assertEqual(return_enum(False), Color.GREEN) def test_enum_module_return(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 @@ -319,6 +308,7 @@ def __init__(self, e: Color): def forward(self): return self.e + make_global(Color) m = TestModule(Color.RED) scripted = torch.jit.script(m) @@ -333,8 +323,6 @@ def forward(self): def test_enum_iterate(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 @@ -347,6 +335,7 @@ def iterate_enum(x: Color): res.append(e.value) return res + make_global(Color) scripted = torch.jit.script(iterate_enum) FileCheck() \ diff --git a/test/jit/test_with.py b/test/jit/test_with.py index f958dc46c39a..35ff5b959737 100644 --- a/test/jit/test_with.py +++ b/test/jit/test_with.py @@ -4,7 +4,7 @@ from typing import Any, List import torch -from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.jit_utils import JitTestCase, make_global # Make the helper files in test/ importable @@ -29,8 +29,6 @@ def test_with_as(self): Check that with statements that use the 'as' keyword to bind expressions to targets work as expected. """ - global Context - @torch.jit.script class Context(object): """ @@ -50,6 +48,8 @@ def __enter__(self): def __exit__(self, type: Any, value: Any, tb: Any): self.count.sub_(0.3) + make_global(Context) + def test_basic(x: torch.Tensor) -> torch.Tensor: """Basic test with one with-statement.""" @@ -185,8 +185,6 @@ def test_with_no_as(self): Check that with statements that do not use the 'as' keyword to bind expressions to targets work as expected. 
""" - global Context - @torch.jit.script class Context(object): """ @@ -206,6 +204,8 @@ def __enter__(self): def __exit__(self, type: Any, value: Any, tb: Any): self.count.sub_(0.3) + make_global(Context) + def test_basic(x: torch.Tensor) -> torch.Tensor: """Basic test with one with-statement.""" @@ -341,8 +341,6 @@ def test_with_exceptions(self): Check that exceptions thrown in the bodies of with-statements are handled correctly. """ - global Context - @torch.jit.script class Context(object): """ @@ -362,6 +360,8 @@ def __enter__(self): def __exit__(self, type: Any, value: Any, tb: Any): self.count.sub_(0.3) + make_global(Context) + @torch.jit.script def method_that_raises() -> torch.Tensor: raise Exception("raised exception") diff --git a/test/test_jit.py b/test/test_jit.py index 4d37cd0a3ef9..e745f898824a 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -65,7 +65,7 @@ freeze_rng_state, set_rng_seed, slowTest, TemporaryFileName, skipIfCompiledWithoutNumpy, \ enable_profiling_mode_for_profiling_tests, TEST_MKL, set_default_dtype, num_profiled_runs from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, disable_autodiff_subgraph_inlining, \ - _trace, enable_cpu_fuser_if, do_input_map, get_execution_plan, \ + _trace, enable_cpu_fuser_if, do_input_map, get_execution_plan, make_global, \ execWrapper, _inline_everything, _tmp_donotuse_dont_inline_everything, \ RUN_CUDA from torch.testing._internal.jit_metaprogramming_utils import create_script_fn, nn_functional_tests, get_script_args, \ @@ -6609,8 +6609,6 @@ def bar(c, b): .check("in foo").check("in baz").run(str(cm.exception)) def test_error_stacktrace_interface(self): - global IFace - @torch.jit.script def baz(c, b): return c + b @@ -6634,6 +6632,8 @@ def one(self, x, y): # type: (Tensor, Tensor) -> Tensor pass + make_global(IFace) + @torch.jit.script def as_interface(x): # type: (IFace) -> IFace diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index be287d0a9a3b..f4dabcbf97de 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -241,7 +241,9 @@ def get_annotation_str(annotation): elif isinstance(annotation, ast.Attribute): return '.'.join([get_annotation_str(annotation.value), annotation.attr]) elif isinstance(annotation, ast.Subscript): - return f"{get_annotation_str(annotation.value)}[{get_annotation_str(annotation.slice.value)}]" # type: ignore + # In Python3.9+ subscript indicies are not wrapped in ast.Index + subscript_slice = annotation.slice if sys.version_info >= (3, 9) else annotation.slice.value # type: ignore + return f"{get_annotation_str(annotation.value)}[{get_annotation_str(subscript_slice)}]" elif isinstance(annotation, ast.Tuple): return ','.join([get_annotation_str(elt) for elt in annotation.elts]) elif isinstance(annotation, ast.Constant) or isinstance(annotation, ast.NameConstant): From d3ec204ef215fca6888d92e18ffa9db126d5b837 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 27 Jan 2021 11:17:53 -0800 Subject: [PATCH 07/41] [quant][graphmode][fx] Add functional conv2d + relu (#51079) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51079 Added support for functional conv2d + relu, will add conv1d and conv3d in future PR Test Plan: python test/test_quantization.py TestQuantizeFxOps.test_functional_conv Imported from OSS Reviewed By: vkuzo Differential Revision: D26089964 fbshipit-source-id: 8703de17de1469f7076651c386c83fb5922a56eb --- test/quantization/test_quantize_fx.py | 80 ++++++++++++++--- .../quantization/fx/quantization_patterns.py | 
88 ++++++++++++++----- 2 files changed, 136 insertions(+), 32 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 8d7ba7dcb4e6..b2243eead1d0 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -1497,7 +1497,7 @@ def forward(self, x): self.checkGraphModeFxOp(model, data, quant_type, quantized_node) @skipIfNoFBGEMM - def test_linear_functional(self): + def test_functional_linear(self): class FuncLinear(torch.nn.Module): def __init__(self, use_bias, has_relu, f_relu): super(FuncLinear, self).__init__() @@ -1595,22 +1595,80 @@ def forward(self, x): quantized_nodes[dim]) @skipIfNoFBGEMM - def test_conv2d_functional(self): - for bias in [True, False]: - conv = torch.nn.Conv2d(1, 1, 1, bias=bias) + def test_functional_conv(self): + """ Test for function conv and functional conv + relu + """ + class FuncConv(torch.nn.Module): + def __init__(self, use_bias, has_relu, f_relu): + super().__init__() + self.w = torch.randn(3, 3, 3, 3) + self.b = torch.randn(3) if use_bias else None + self.stride = (1, 1) + self.padding = (0, 0) + self.dilation = (1, 1) + self.groups = 1 + self.use_bias = use_bias + if has_relu: + if f_relu: + self.relu = F.relu + else: + self.relu = torch.nn.ReLU() + else: + self.relu = torch.nn.Identity() + + def forward(self, x): + x = F.conv2d(x, self.w, self.b, self.stride, self.padding, self.dilation, self.groups) + x = self.relu(x) + return x + + data = (torch.randn((2, 3, 4, 4), dtype=torch.float),) + + quant_type_to_prepare_expected_node_occurrence = { + QuantType.DYNAMIC: {}, # There should be 3 observers: after input, weight and activation. - # No observer after bias. - prepare_expected_node_occurrence = { + QuantType.STATIC: { ns.call_module(torch.quantization.HistogramObserver): 2, ns.call_module(torch.quantization.PerChannelMinMaxObserver): 1, + }, + # There should be 3 observers: after input, weight and activation. 
+ QuantType.QAT: { + ns.call_module(torch.quantization.FakeQuantize): 3, + }, + } + quant_type_to_qconv_fun = { + QuantType.STATIC: ns.call_function(torch.ops.quantized.conv2d), + QuantType.QAT: ns.call_function(torch.ops.quantized.conv2d), + } + quant_type_to_qconv_relu_fun = { + QuantType.STATIC: ns.call_function(torch.ops.quantized.conv2d_relu), + QuantType.QAT: ns.call_function(torch.ops.quantized.conv2d_relu), + } + + options = itertools.product( + self.static_quant_types, + (True, False), # use_bias + (True, False), # has_relu + (True, False), # functional relu + ) + for quant_type, use_bias, has_relu, f_relu in options: + model = FuncConv(use_bias, has_relu, f_relu) + if has_relu: + qconv_fun = quant_type_to_qconv_relu_fun[quant_type] + else: + qconv_fun = quant_type_to_qconv_fun[quant_type] + + convert_node_occurrence = { + ns.call_function(torch.quantize_per_tensor): 1 if quant_type != QuantType.DYNAMIC else 0, + qconv_fun: 1, + ns.call_method("dequantize"): 1 if quant_type != QuantType.DYNAMIC else 0 } - expected_node_occurrence = \ - {ns.call_function(torch.ops.quantized.conv2d): 1} + prepare_expected_node_occurrence = \ + quant_type_to_prepare_expected_node_occurrence[quant_type] self.checkGraphModeFxOp( - conv, (torch.randn(4, 1, 4, 4),), QuantType.STATIC, + model, data, quant_type, qconv_fun, prepare_expected_node_occurrence=prepare_expected_node_occurrence, - expected_node_occurrence=expected_node_occurrence, - ) + expected_node_occurrence=convert_node_occurrence) + @skipIfNoFBGEMM def test_quantized_conv_relu(self): diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index e6ac74dbf903..06f15240e761 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -218,12 +218,35 @@ def __init__(self, quantizer: QuantizerCls, node: Node): def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, debug: bool = False, convert_custom_config_dict: Dict[str, Any] = None) -> Node: + # Supported combinations are: + # quant_type | activation (compute_type) | weight + # static quint8 qint8 + + # tuple (activation_dtype, weight_dtype, compute_dtype) + supported_dtypes = [ + (torch.quint8, torch.qint8, None), + ] + # TODO: debug option for conv module qconfig = quantizer.qconfig_map[node.name] + dtypes = get_qconfig_dtypes(qconfig) + # leave the op unquantized if the dtype combination is not supported + if dtypes not in supported_dtypes: + warnings.warn( + "dtype combination: {} is not " + "supported by Conv " + "supported dtype combinations are: {}".format(dtypes, supported_dtypes)) + if self.relu_node: + conv_out = quantizer.quantized_graph.node_copy(self.conv_node, load_arg(quantized=False)) + relu_args = [conv_out] + relu_args.extend(load_arg(quantized=False)(self.relu_node.args[1:])) + relu_kwargs = load_arg(quantized=False)(self.relu_node.kwargs) + return quantizer.quantized_graph.create_node( + "call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs) + else: + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=False)) + activation_statically_quantized = activation_is_statically_quantized(qconfig) - # only static qunatization (for both ptq and qat) is supported for conv - if not activation_statically_quantized: - return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) if self.conv_node.op == 'call_module': # note that relu should already be fused into conv module in the fusion step @@ -246,21 +269,32 @@ 
def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, (load_arg(quantized=True)(self.conv_node.args[0]),), {}) else: # call_function - assert self.conv_node.op == 'call_function' - if self.relu_node is not None: - raise Exception("functional conv + relu is not supported yet") + assert self.conv_node.op == "call_function" if debug: args = load_arg(quantized=[0, 1])(self.conv_node.args) args = load_arg(quantized=False)(self.conv_node.args) kwargs = load_arg(quantized=False)(self.conv_node.kwargs) - conv_out = quantizer.quantized_graph.create_node( - 'call_function', torch.nn.functional.conv2d, args, kwargs) - root_module = quantizer.modules[''] - return quantize_node( - root_module, quantizer.quantized_graph, conv_out, quantizer.activation_post_process_map[self.conv_node.name]) + op_out = quantizer.quantized_graph.create_node( + "call_function", torch.nn.functional.conv2d, args, kwargs) + if self.relu_node: + relu_args = [op_out] + relu_args.extend(load_arg(quantized=False)(self.relu_node.args[1:])) + relu_kwargs = load_arg(quantized=False)(self.relu_node.kwargs) + op_out = quantizer.quantized_graph.create_node( + "call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs) + + if activation_statically_quantized: + root_module = quantizer.modules[''] + act_post_process_name = self.relu_node.name if self.relu_node else self.conv_node.name + return quantize_node( + root_module, quantizer.quantized_graph, op_out, + quantizer.activation_post_process_map[act_post_process_name]) + else: + # output for dynamically quantized conv op is not quantized + return op_out else: - assert len(self.conv_node.args) == 7, \ - 'only conv2d calls with all arguments specified is support right now in debug=False option' + assert len(self.conv_node.args) >= 7, \ + "only conv2d calls with all arguments specified is supported right now in debug=False option" args = load_arg(quantized=[0, 1])(self.conv_node.args) # pack weight weight = load_arg(quantized=True)(self.conv_node.args[1]) @@ -268,14 +302,23 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, prepack_args = tuple([weight] + list(other_args)) packed_weight = quantizer.quantized_graph.create_node( 'call_function', torch.ops.quantized.conv2d_prepack, prepack_args, {}) + assert activation_statically_quantized, \ + "currently only static quantization is supported for conv" # construct conv input - conv_input = load_arg(quantized=True)(self.conv_node.args[0]) - activation_post_process = quantizer.activation_post_process_map[self.conv_node.name] - scale, zero_point, _ = get_per_tensor_qparams(activation_post_process) - qconv_args = (conv_input, packed_weight, scale, zero_point) - kwargs = load_arg(quantized=False)(self.conv_node.kwargs) - return quantizer.quantized_graph.create_node( - 'call_function', torch.ops.quantized.conv2d, qconv_args, kwargs) + if activation_statically_quantized: + qconv_op = torch.ops.quantized.conv2d_relu if self.relu_node else torch.ops.quantized.conv2d + conv_input = load_arg(quantized=True)(self.conv_node.args[0]) + act_post_process_name = self.relu_node.name if self.relu_node else self.conv_node.name + activation_post_process = quantizer.activation_post_process_map[act_post_process_name] + scale, zero_point, _ = get_per_tensor_qparams(activation_post_process) + qconv_args = (conv_input, packed_weight, scale, zero_point) + kwargs = load_arg(quantized=False)(self.conv_node.kwargs) + return quantizer.quantized_graph.create_node( + 'call_function', qconv_op, qconv_args, kwargs) + 
else: + # conv2d_dyanmic branch + raise Exception("Only static quant is supported for conv") + # handle linear, maybe followed by relu @register_quant_pattern(torch.nn.Linear) @@ -316,6 +359,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, ] qconfig = quantizer.qconfig_map[node.name] dtypes = get_qconfig_dtypes(qconfig) + # leave the op unquantized if the dtype combination is not supported if dtypes not in supported_dtypes: warnings.warn( "dtype combination: {} is not " @@ -412,9 +456,9 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, prepack_op = get_linear_prepack_op_for_dtype(weight_dtype(qconfig)) packed_weight = quantizer.quantized_graph.create_node( 'call_function', prepack_op, prepack_args, {}) - qlinear_op = torch.ops.quantized.linear_relu if self.relu_node else torch.ops.quantized.linear # construct linear input if activation_statically_quantized: + qlinear_op = torch.ops.quantized.linear_relu if self.relu_node else torch.ops.quantized.linear linear_input = load_arg(quantized=True)(self.linear_node.args[0]) act_post_process_name = self.relu_node.name if self.relu_node else self.linear_node.name activation_post_process = \ @@ -484,6 +528,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, emb_node = node qconfig = quantizer.qconfig_map[node.name] dtypes = get_qconfig_dtypes(qconfig) + # leave the op unquantized if the dtype combination is not supported if dtypes not in supported_dtypes: warnings.warn( "dtype combination: {} is not " @@ -527,6 +572,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, assert node.op == 'call_module' qconfig = quantizer.qconfig_map[node.name] dtypes = get_qconfig_dtypes(qconfig) + # leave the op unquantized if the dtype combination is not supported if dtypes not in supported_dtypes: warnings.warn( "dtype combination: {} is not " From b77f72b5a0d7ccf30454a8b79c5fddd9b069e6cf Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Wed, 27 Jan 2021 11:41:58 -0800 Subject: [PATCH 08/41] Enable TensorPipe's SHM transport (#50760) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50760 The SHM transport uses shared-memory-backed ringbuffers to transfer small payloads between processes on the same machine. It was disabled in v1.6 due to a CMake mishap but we've since realized that it also doesn't work that well in docker and other setups. Enabling it here to see whether CircleCI fails. 
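For context, the sketch below shows a same-host two-worker RPC setup of the kind this transport serves; with SHM enabled, TensorPipe can route small payloads over the shared-memory ringbuffers. The worker names, port, and payload are illustrative and not part of this patch:

```python
import os
import torch
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp

def run(rank: int, world_size: int = 2) -> None:
    # Both workers live on the same machine; transport selection is internal
    # to TensorPipe, so enabling SHM needs no change on the user side.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    if rank == 0:
        fut = rpc.rpc_async("worker1", torch.add, args=(torch.ones(2), 1))
        print(fut.wait())
    rpc.shutdown()

if __name__ == "__main__":
    mp.spawn(run, nprocs=2)
```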
ghstack-source-id: 120470890 Test Plan: Exported three times to CircleCI with tests consistently passing Reviewed By: mrshenli Differential Revision: D23814828 fbshipit-source-id: f355cb6515776debad536924de4f4d3fbb05a874 --- .circleci/config.yml | 2 +- .circleci/verbatim-sources/job-specs/pytorch-job-specs.yml | 2 +- cmake/Dependencies.cmake | 2 -- torch/csrc/distributed/rpc/tensorpipe_agent.cpp | 6 ++++++ 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4cf4dc4e2c6a..3fceba2db8dc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -574,7 +574,7 @@ jobs: hostname export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=8g --ipc=host --device /dev/kfd --device /dev/dri --group-add video -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) else - export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) + export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=1g --ipc=host -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) fi echo "id=${id}" >> "${BASH_ENV}" diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index 8cbb9a4e3f40..99b327c275a0 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -133,7 +133,7 @@ jobs: hostname export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=8g --ipc=host --device /dev/kfd --device /dev/dri --group-add video -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) else - export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) + export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=1g --ipc=host -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) fi echo "id=${id}" >> "${BASH_ENV}" diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index e138a86c61da..75afbb8b6cf4 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1347,7 +1347,6 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE) set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE) endif() set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE) - set(TP_ENABLE_SHM OFF CACHE BOOL "" FORCE) set(TP_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE) add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/tensorpipe) @@ -1851,4 +1850,3 @@ if(USE_KINETO) set(USE_KINETO OFF) endif() endif() - diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index 4f56c916cb98..0417577d2499 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -14,6 +14,12 @@ #include #endif +#if TENSORPIPE_HAS_SHM_TRANSPORT +// Needed for ::getpid(), which is used to create a unique address. 
+#include +#include +#endif + namespace torch { namespace distributed { namespace rpc { From 1b7a4f9cde8a214c8d0b1d9753f42c0a7406f3eb Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Wed, 27 Jan 2021 11:44:18 -0800 Subject: [PATCH 09/41] .github: Add GitHub Actions workflow to build wheels (#50633) Summary: Signed-off-by: Eli Uriegas Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/50633 Reviewed By: samestep Differential Revision: D26083492 Pulled By: seemethere fbshipit-source-id: c133671b9cf5074539133ee79fca5c680793a85d --- .../scripts/generate_binary_build_matrix.py | 86 +++++++++++++ .github/scripts/generate_pytorch_version.py | 118 ++++++++++++++++++ .github/workflows/build_linux_binaries.yml | 86 +++++++++++++ 3 files changed, 290 insertions(+) create mode 100644 .github/scripts/generate_binary_build_matrix.py create mode 100755 .github/scripts/generate_pytorch_version.py create mode 100644 .github/workflows/build_linux_binaries.yml diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py new file mode 100644 index 000000000000..d95518bc6dae --- /dev/null +++ b/.github/scripts/generate_binary_build_matrix.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 + +"""Generates a matrix to be utilized through github actions + +Will output a condensed version of the matrix if on a pull request that only +includes the latest version of python we support built on three different +architectures: + * CPU + * Latest CUDA + * Latest ROCM +""" + +import json +import os +import itertools + +CUDA_ARCHES = [ + "10.1", + "10.2", + "11.0" +] + +ROCM_ARCHES = [ + "3.10", + "4.0" +] + +FULL_ARCHES = [ + "cpu", + *CUDA_ARCHES, + *ROCM_ARCHES +] + +CONTAINER_IMAGES = { + **{ + # TODO: Re-do manylinux CUDA image tagging scheme to be similar to + # ROCM so we don't have to do this replacement + gpu_arch: f"pytorch/manylinux-cuda{gpu_arch.replace('.', '')}" + for gpu_arch in CUDA_ARCHES + }, + **{ + gpu_arch: f"pytorch/manylinux-rocm:{gpu_arch}" + for gpu_arch in ROCM_ARCHES + }, + "cpu": "pytorch/manylinux-cpu" +} + +FULL_PYTHON_VERSIONS = [ + "3.6", + "3.7", + "3.8", + "3.9", +] + + +def is_pull_request(): + return os.environ.get("GITHUB_HEAD_REF") + +def generate_matrix(): + python_versions = FULL_PYTHON_VERSIONS + arches = FULL_ARCHES + if is_pull_request(): + python_versions = [python_versions[-1]] + arches = ["cpu", CUDA_ARCHES[-1], ROCM_ARCHES[-1]] + matrix = [] + for item in itertools.product(python_versions, arches): + python_version, arch_version = item + # Not my favorite code here + gpu_arch_type = "cuda" + if "rocm" in CONTAINER_IMAGES[arch_version]: + gpu_arch_type = "rocm" + elif "cpu" in CONTAINER_IMAGES[arch_version]: + gpu_arch_type = "cpu" + matrix.append({ + "python_version": python_version, + "gpu_arch_type": gpu_arch_type, + "gpu_arch_version": arch_version, + "container_image": CONTAINER_IMAGES[arch_version] + }) + return json.dumps({"include": matrix}) + +def main(): + print(generate_matrix()) + +if __name__ == "__main__": + main() diff --git a/.github/scripts/generate_pytorch_version.py b/.github/scripts/generate_pytorch_version.py new file mode 100755 index 000000000000..93fc4ca6db3a --- /dev/null +++ b/.github/scripts/generate_pytorch_version.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 + +import argparse +import os +import subprocess +import re + +from datetime import datetime +from distutils.util import strtobool +from pathlib import Path + +LEADING_V_PATTERN = re.compile("^v") +TRAILING_RC_PATTERN 
= re.compile("-rc[0-9]*$") +LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$") + +class NoGitTagException(Exception): + pass + +def get_pytorch_root(): + return Path(subprocess.check_output( + ['git', 'rev-parse', '--show-toplevel'] + ).decode('ascii').strip()) + +def get_tag(): + root = get_pytorch_root() + # We're on a tag + am_on_tag = ( + subprocess.run( + ['git', 'describe', '--tags', '--exact'], + cwd=root, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL + ).returncode == 0 + ) + tag = "" + if am_on_tag: + dirty_tag = subprocess.check_output( + ['git', 'describe'], + cwd=root + ).decode('ascii').strip() + # Strip leading v that we typically do when we tag branches + # ie: v1.7.1 -> 1.7.1 + tag = re.sub(LEADING_V_PATTERN, "", dirty_tag) + # Strip trailing rc pattern + # ie: 1.7.1-rc1 -> 1.7.1 + tag = re.sub(TRAILING_RC_PATTERN, "", tag) + return tag + +def get_base_version(): + root = get_pytorch_root() + dirty_version = open(root / 'version.txt', 'r').read().strip() + # Strips trailing a0 from version.txt, not too sure why it's there in the + # first place + return re.sub(LEGACY_BASE_VERSION_SUFFIX_PATTERN, "", dirty_version) + +class PytorchVersion: + def __init__(self, gpu_arch_type, gpu_arch_version, no_build_suffix): + self.gpu_arch_type = gpu_arch_type + self.gpu_arch_version = gpu_arch_version + self.no_build_suffix = no_build_suffix + + def get_post_build_suffix(self): + # CUDA 10.2 is the version to be uploaded to PyPI so it doesn't have a + # version suffix + if ((self.gpu_arch_type == "cuda" and self.gpu_arch_version == "10.2") + or self.no_build_suffix): + return "" + if self.gpu_arch_type == "cuda": + return f"+cu{self.gpu_arch_version.replace('.', '')}" + return f"+{self.gpu_arch_type}{self.gpu_arch_version}" + + def get_release_version(self): + if not get_tag(): + raise NoGitTagException( + "Not on a git tag, are you sure you want a release version?" 
+ ) + return f"{get_tag()}{self.get_post_build_suffix()}" + + def get_nightly_version(self): + date_str = datetime.today().strftime('%Y%m%d') + build_suffix = self.get_post_build_suffix() + return f"{get_base_version()}.dev{date_str}{build_suffix}" + +def main(): + parser = argparse.ArgumentParser( + description="Generate pytorch version for binary builds" + ) + parser.add_argument( + "--no-build-suffix", + type=strtobool, + help="Whether or not to add a build suffix typically (+cpu)", + default=os.environ.get("NO_BUILD_SUFFIX", False) + ) + parser.add_argument( + "--gpu-arch-type", + type=str, + help="GPU arch you are building for, typically (cpu, cuda, rocm)", + default=os.environ.get("GPU_ARCH_TYPE", "cpu") + ) + parser.add_argument( + "--gpu-arch-version", + type=str, + help="GPU arch version, typically (10.2, 4.0), leave blank for CPU", + default=os.environ.get("GPU_ARCH_VERSION", "") + ) + args = parser.parse_args() + version_obj = PytorchVersion( + args.gpu_arch_type, + args.gpu_arch_version, + args.no_build_suffix + ) + try: + print(version_obj.get_release_version()) + except NoGitTagException: + print(version_obj.get_nightly_version()) + +if __name__ == "__main__": + main() diff --git a/.github/workflows/build_linux_binaries.yml b/.github/workflows/build_linux_binaries.yml new file mode 100644 index 000000000000..fc8917d74625 --- /dev/null +++ b/.github/workflows/build_linux_binaries.yml @@ -0,0 +1,86 @@ +name: Build Linux Wheels + +on: + # TODO: These are only runnable from workflow_dispatch, we need to eventually add + # a cron + # TODO: Add an on_release trigger to build on tags + workflow_dispatch: + +jobs: + generate-build-matrix: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + container: + image: python:3.9 + steps: + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - name: Generating build matrix + id: set-matrix + run: | + # outputting for debugging purposes + python .github/scripts/generate_binary_build_matrix.py + MATRIX=$(python .github/scripts/generate_binary_build_matrix.py) + echo "::set-output name=matrix::${MATRIX}" + build-wheel: + if: ${{ github.repository_owner == 'pytorch' }} + needs: generate-build-matrix + runs-on: linux.2xlarge + strategy: + matrix: + ${{ fromJson(needs.generate-build-matrix.outputs.matrix) }} + container: + image: ${{ matrix.container_image }} + env: + DESIRED_PYTHON: ${{ matrix.python_version }} + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: ${{ matrix.gpu_arch_version }} + GPU_ARCH_VERSION: ${{ matrix.GPU_ARCH_VERSION }} + GPU_ARCH_TYPE: ${{ matrix.gpu_arch_type }} + PYTORCH_BUILD_NUMBER: 1 + SKIP_ALL_TESTS: 1 + steps: + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + with: + path: pytorch + submodules: recursive + - name: Clone pytorch/builder + uses: actions/checkout@v2 + with: + repository: pytorch/builder + path: builder + - name: Generate version string + working-directory: pytorch/ + run: | + version=$(.github/scripts/generate_pytorch_version.py) + echo "Generated version: ${version}" + echo "PYTORCH_BUILD_VERSION=${version}" >> $GITHUB_ENV + # TODO: Remove this once we remove the need for the directories to be + # in specific locations + - name: Symlink repositories to root directory (for legacy scripts purposes) + run: | + ln -s $(pwd)/pytorch /pytorch + ln -s $(pwd)/builder /builder + # TODO: Bundle the correct build script in the base container 
image so + # that we don't have to do this type of specification + - name: Build PyTorch binary (CUDA specific) + if: ${{ matrix.gpu_arch_type == 'cuda' }} + run: | + /builder/manywheel/build.sh + - name: Build PyTorch binary (ROCM specific) + if: ${{ matrix.gpu_arch_type == 'rocm' }} + run: | + /builder/manywheel/build_rocm.sh + - name: Build PyTorch binary (CPU specific) + if: ${{ matrix.gpu_arch_type == 'cpu' }} + run: | + /builder/manywheel/build_cpu.sh + - uses: actions/upload-artifact@v2 + with: + name: pytorch-wheel-py${{ matrix.python_version }}-${{matrix.gpu_arch_type}}-${{ matrix.gpu_arch_version }} + path: /remote/**/*.whl + # TODO: Add a step here for uploading binaries From dd1a97b3ae8a14a5c167fd97273531d2b5ad566b Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 27 Jan 2021 11:54:21 -0800 Subject: [PATCH 10/41] [quant][graphmode][fx] Add support for functional conv1d and conv3d (#51155) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51155 This PR added support for quantizing functional conv1d, conv3d, conv1d_relu and conv3d_relu Test Plan: python test/test_quantization.py TestQuantizeFxOps.test_functional_conv Imported from OSS Reviewed By: vkuzo Differential Revision: D26089965 fbshipit-source-id: 4aea507d05b744807e993f6d3711ab308fb7591b --- test/quantization/test_quantize_fx.py | 60 +++++++++++++------ .../quantization/fx/quantization_patterns.py | 20 +++++-- torch/quantization/fx/utils.py | 26 ++++++++ 3 files changed, 84 insertions(+), 22 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index b2243eead1d0..295144bddc3f 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -1598,14 +1598,21 @@ def forward(self, x): def test_functional_conv(self): """ Test for function conv and functional conv + relu """ + convs = { + 1: torch.nn.functional.conv1d, + 2: torch.nn.functional.conv2d, + 3: torch.nn.functional.conv3d, + } + class FuncConv(torch.nn.Module): - def __init__(self, use_bias, has_relu, f_relu): + def __init__(self, dim, use_bias, has_relu, f_relu): super().__init__() - self.w = torch.randn(3, 3, 3, 3) + self.dim = dim + self.w = torch.randn(tuple([3] * (dim + 2))) self.b = torch.randn(3) if use_bias else None - self.stride = (1, 1) - self.padding = (0, 0) - self.dilation = (1, 1) + self.stride = tuple([1] * dim) + self.padding = tuple([0] * dim) + self.dilation = tuple([1] * dim) self.groups = 1 self.use_bias = use_bias if has_relu: @@ -1617,12 +1624,10 @@ def __init__(self, use_bias, has_relu, f_relu): self.relu = torch.nn.Identity() def forward(self, x): - x = F.conv2d(x, self.w, self.b, self.stride, self.padding, self.dilation, self.groups) + x = convs[self.dim](x, self.w, self.b, self.stride, self.padding, self.dilation, self.groups) x = self.relu(x) return x - data = (torch.randn((2, 3, 4, 4), dtype=torch.float),) - quant_type_to_prepare_expected_node_occurrence = { QuantType.DYNAMIC: {}, # There should be 3 observers: after input, weight and activation. 
@@ -1636,31 +1641,50 @@ def forward(self, x): }, } quant_type_to_qconv_fun = { - QuantType.STATIC: ns.call_function(torch.ops.quantized.conv2d), - QuantType.QAT: ns.call_function(torch.ops.quantized.conv2d), + QuantType.STATIC: { + 1: ns.call_function(torch.ops.quantized.conv1d), + 2: ns.call_function(torch.ops.quantized.conv2d), + 3: ns.call_function(torch.ops.quantized.conv3d) + }, + QuantType.QAT: { + 1: ns.call_function(torch.ops.quantized.conv1d), + 2: ns.call_function(torch.ops.quantized.conv2d), + 3: ns.call_function(torch.ops.quantized.conv3d) + }, } quant_type_to_qconv_relu_fun = { - QuantType.STATIC: ns.call_function(torch.ops.quantized.conv2d_relu), - QuantType.QAT: ns.call_function(torch.ops.quantized.conv2d_relu), + QuantType.STATIC: { + 1: ns.call_function(torch.ops.quantized.conv1d_relu), + 2: ns.call_function(torch.ops.quantized.conv2d_relu), + 3: ns.call_function(torch.ops.quantized.conv3d_relu) + }, + QuantType.QAT: { + 1: ns.call_function(torch.ops.quantized.conv1d_relu), + 2: ns.call_function(torch.ops.quantized.conv2d_relu), + 3: ns.call_function(torch.ops.quantized.conv3d_relu) + }, } options = itertools.product( + [1, 2, 3], # dims self.static_quant_types, (True, False), # use_bias (True, False), # has_relu (True, False), # functional relu ) - for quant_type, use_bias, has_relu, f_relu in options: - model = FuncConv(use_bias, has_relu, f_relu) + for dim, quant_type, use_bias, has_relu, f_relu in options: + data_dims = [2, 3] + [4] * dim + data = (torch.randn(tuple(data_dims), dtype=torch.float),) + model = FuncConv(dim, use_bias, has_relu, f_relu) if has_relu: - qconv_fun = quant_type_to_qconv_relu_fun[quant_type] + qconv_fun = quant_type_to_qconv_relu_fun[quant_type][dim] else: - qconv_fun = quant_type_to_qconv_fun[quant_type] + qconv_fun = quant_type_to_qconv_fun[quant_type][dim] convert_node_occurrence = { - ns.call_function(torch.quantize_per_tensor): 1 if quant_type != QuantType.DYNAMIC else 0, + ns.call_function(torch.quantize_per_tensor): 1, qconv_fun: 1, - ns.call_method("dequantize"): 1 if quant_type != QuantType.DYNAMIC else 0 + ns.call_method("dequantize"): 1 } prepare_expected_node_occurrence = \ quant_type_to_prepare_expected_node_occurrence[quant_type] diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 06f15240e761..7d0e3f7d0a78 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -32,6 +32,8 @@ quantize_node, get_per_tensor_qparams, get_linear_prepack_op_for_dtype, + get_qconv_prepack_op, + get_qconv_op, ) from .quantization_types import QuantizerCls @@ -188,7 +190,10 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, @register_quant_pattern(torch.nn.Conv1d) @register_quant_pattern(torch.nn.Conv2d) @register_quant_pattern(torch.nn.Conv3d) +@register_quant_pattern(torch.nn.functional.conv1d) @register_quant_pattern(torch.nn.functional.conv2d) +@register_quant_pattern(torch.nn.functional.conv3d) +# TODO: add qat.Conv1d and qat.Conv3d @register_quant_pattern(torch.nn.qat.Conv2d) @register_quant_pattern(torch.nn.intrinsic.ConvReLU1d) @register_quant_pattern(torch.nn.intrinsic.ConvReLU2d) @@ -198,8 +203,12 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, @register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU1d) @register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU2d) @register_quant_pattern(torch.nn.intrinsic.qat.ConvReLU2d) 
+@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv1d)) @register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv2d)) +@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv3d)) +@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv1d)) @register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv2d)) +@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv3d)) # just for error checks @register_quant_pattern((torch.nn.ReLU, torch.nn.Conv2d)) @register_quant_pattern((torch.nn.functional.relu, torch.nn.Conv2d)) @@ -212,8 +221,10 @@ def __init__(self, quantizer: QuantizerCls, node: Node): self.relu_node = node node = node.args[0] # type: ignore self.conv_node = node - if node.op == 'call_module': + if node.op == "call_module": self.conv = quantizer.modules[self.conv_node.target] + elif node.op == "call_function": + self.conv = node.target def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, debug: bool = False, @@ -275,7 +286,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, args = load_arg(quantized=False)(self.conv_node.args) kwargs = load_arg(quantized=False)(self.conv_node.kwargs) op_out = quantizer.quantized_graph.create_node( - "call_function", torch.nn.functional.conv2d, args, kwargs) + "call_function", self.conv, args, kwargs) if self.relu_node: relu_args = [op_out] relu_args.extend(load_arg(quantized=False)(self.relu_node.args[1:])) @@ -300,13 +311,14 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, weight = load_arg(quantized=True)(self.conv_node.args[1]) other_args = load_arg(quantized=False)(self.conv_node.args[2:]) prepack_args = tuple([weight] + list(other_args)) + prepack_op = get_qconv_prepack_op(self.conv) packed_weight = quantizer.quantized_graph.create_node( - 'call_function', torch.ops.quantized.conv2d_prepack, prepack_args, {}) + "call_function", prepack_op, prepack_args, {}) assert activation_statically_quantized, \ "currently only static quantization is supported for conv" # construct conv input if activation_statically_quantized: - qconv_op = torch.ops.quantized.conv2d_relu if self.relu_node else torch.ops.quantized.conv2d + qconv_op = get_qconv_op(self.conv, self.relu_node is not None) conv_input = load_arg(quantized=True)(self.conv_node.args[0]) act_post_process_name = self.relu_node.name if self.relu_node else self.conv_node.name activation_post_process = quantizer.activation_post_process_map[act_post_process_name] diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 8285e204b1ed..8d3445e46dba 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -179,6 +179,32 @@ def get_linear_prepack_op_for_dtype(dtype): else: raise Exception("can't get linear prepack op for dtype:", dtype) +def get_qconv_prepack_op(conv_op: Callable) -> Callable: + prepack_ops = { + torch.nn.functional.conv1d: torch.ops.quantized.conv1d_prepack, + torch.nn.functional.conv2d: torch.ops.quantized.conv2d_prepack, + torch.nn.functional.conv3d: torch.ops.quantized.conv3d_prepack + } + prepack_op = prepack_ops.get(conv_op, None) + assert prepack_op, "Didn't find prepack op for {}".format(conv_op) + return prepack_op + +def get_qconv_op(conv_op: Callable, has_relu: bool) -> Callable: + qconv_op = { + # has relu + True: { + torch.nn.functional.conv1d: torch.ops.quantized.conv1d_relu, + torch.nn.functional.conv2d: torch.ops.quantized.conv2d_relu, + torch.nn.functional.conv3d: 
torch.ops.quantized.conv3d_relu + }, + False: { + torch.nn.functional.conv1d: torch.ops.quantized.conv1d, + torch.nn.functional.conv2d: torch.ops.quantized.conv2d, + torch.nn.functional.conv3d: torch.ops.quantized.conv3d + } + } + return qconv_op[has_relu].get(conv_op) + # Returns a function that can get a new attribute name for module with given # prefix, for example, # >> get_new_observer_name = get_new_attr_name_with_prefix('_observer') From 6d098095eb122685ae87b83a56b95771382a3ef6 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 27 Jan 2021 11:56:31 -0800 Subject: [PATCH 11/41] [numpy] torch.lgamma: promote integer inputs to float (#50140) Summary: Reference: https://github.com/pytorch/pytorch/issues/42515 Pull Request resolved: https://github.com/pytorch/pytorch/pull/50140 Reviewed By: mrshenli Differential Revision: D25951094 Pulled By: mruberry fbshipit-source-id: e53f1dbddff889710f05d43dbc9587382d3decb0 --- aten/src/ATen/native/UnaryOps.cpp | 35 ++------------- .../src/ATen/native/cuda/UnaryGammaKernels.cu | 2 +- aten/src/ATen/native/native_functions.yaml | 14 +++--- test/test_torch.py | 1 - test/test_unary_ufuncs.py | 2 - torch/csrc/jit/tensorexpr/kernel.cpp | 5 ++- .../_internal/common_methods_invocations.py | 44 +++++++++++++++++++ 7 files changed, 57 insertions(+), 46 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index ca311f86091e..9ff806bd5054 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -650,38 +650,9 @@ Tensor& mvlgamma_(Tensor& self, int64_t p) { return self.copy_(args.lgamma_().sum(-1).add_(p * (p - 1) * std::log(c10::pi) / 4.)); } -// NB: If you use this macro, you may also need to add a CUDA forwarding -// stub in CUDAUnaryOps - -#define IMPLEMENT_UNARY_OP_CORE(op) \ - Tensor op(const Tensor& self) { \ - Tensor result = at::empty({0}, self.options()); \ - at::op##_out(result, self); \ - return result; \ - } - -#define IMPLEMENT_UNARY_OP_OUT_INPLACE(op, prefix, device) \ - Tensor& _##op##__##prefix(Tensor& self) { \ - return at::op##_out(self, self); \ - } \ - Tensor& _##op##_out_##prefix(Tensor& result, const Tensor& self) { \ - checkDeviceType(#op, result, DeviceType::device); \ - checkLayout(#op, result, Layout::Strided); \ - auto iter = TensorIterator::unary_op(result, self); \ - op##_stub(iter.device_type(), iter); \ - return result; \ - } - -#define IMPLEMENT_UNARY_OP_VEC(op) \ - IMPLEMENT_UNARY_OP_CORE(op) \ - IMPLEMENT_UNARY_OP_OUT_INPLACE(op, cpu, CPU) - -#define IMPLEMENT_UNARY_OP_VEC_CUDA(op) \ - IMPLEMENT_UNARY_OP_CORE(op) \ - IMPLEMENT_UNARY_OP_OUT_INPLACE(op, cpu, CPU) \ - IMPLEMENT_UNARY_OP_OUT_INPLACE(op, cuda, CUDA) - -IMPLEMENT_UNARY_OP_VEC_CUDA(lgamma) +Tensor& lgamma_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, lgamma_stub); } +Tensor lgamma(const Tensor& self) { return unary_op_impl_float(self, lgamma_stub); } +Tensor& lgamma_(Tensor& self) { return unary_op_impl_(self, at::lgamma_out); } DEFINE_DISPATCH(abs_stub); DEFINE_DISPATCH(angle_stub); diff --git a/aten/src/ATen/native/cuda/UnaryGammaKernels.cu b/aten/src/ATen/native/cuda/UnaryGammaKernels.cu index 97dbeefccc77..cdcf92e719d8 100644 --- a/aten/src/ATen/native/cuda/UnaryGammaKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryGammaKernels.cu @@ -41,7 +41,7 @@ void polygamma_kernel_cuda(TensorIterator& iter, int64_t n) { } void lgamma_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "lgamma_cuda", [&]() { + 
AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "lgamma_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::lgamma(a); }); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 906dc08eddd5..1856b9a9bf13 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -5148,12 +5148,6 @@ dispatch: CPU, CUDA: __irshift__ -- func: lgamma_(Tensor(a!) self) -> Tensor(a!) - variants: method - dispatch: - CPU: _lgamma__cpu - CUDA: _lgamma__cuda - - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method dispatch: @@ -5979,8 +5973,12 @@ - func: lgamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures dispatch: - CPU: _lgamma_out_cpu - CUDA: _lgamma_out_cuda + CPU, CUDA: lgamma_out + +- func: lgamma_(Tensor(a!) self) -> Tensor(a!) + variants: method + dispatch: + CPU, CUDA: lgamma_ - func: lgamma(Tensor self) -> Tensor variants: method, function diff --git a/test/test_torch.py b/test/test_torch.py index 55f5ee73c187..f027db3b9ef6 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6848,7 +6848,6 @@ def inner(self, device, dtype): ('round', '', _small_3d, lambda t, d: [], 1e-5, 1e-2, 1e-5, _float_types, [torch.bfloat16]), ('trunc', '', _small_3d, lambda t, d: [], 1e-5, 1e-2, 1e-5, _float_types, [torch.bfloat16]), ('ceil', '', _small_3d, lambda t, d: [], 1e-5, 1e-2, 1e-5, _float_types, [torch.bfloat16]), - ('lgamma', '', _small_3d, lambda t, d: [], 1e-2, 1e-1, 1e-5, _float_types_no_half, [torch.bfloat16]), ] # Creates and decorates a generic test and adds it to the class. diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 365c33179206..3497ccd04cc1 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -1684,8 +1684,6 @@ def _medium_2d(dtype, device): _TorchMathTestMeta('frac', reffn='fmod', refargs=lambda x: (x.numpy(), 1)), _TorchMathTestMeta('trunc'), _TorchMathTestMeta('round'), - # FIXME lgamma produces different result compared to scipy at -inf - _TorchMathTestMeta('lgamma', reffn='gammaln', ref_backend='scipy', replace_inf_with_nan=True), _TorchMathTestMeta('polygamma', args=[0], substr='_0', reffn='polygamma', refargs=lambda x: (0, x.numpy()), input_fn=_generate_gamma_input, inputargs=[False], ref_backend='scipy'), diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 910de06b2693..5a727cc5a92e 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1350,8 +1350,9 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } break; case aten::lgamma: { - return computeOneOperand( - "aten_lgamma", v, [](const ExprHandle& a) { return lgamma(a); }); + return computeOneOperand("aten_lgamma", v, [](const ExprHandle& a) { + return lgamma(promoteIntegerToDefaultType(a)); + }); } break; case prim::ConstantChunk: { diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 8be600914d97..2ad65a9631dc 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1774,6 +1774,29 @@ def reference_sigmoid(x): return (1 / (1 + np.exp(-x))) return scipy.special.expit(x) + def reference_lgamma(x): + # scipy.special.gammaln returns `-inf` when input is `-inf`. + # While Pytorch, C and C++, all return `inf` when input is `-inf`. 
+ # Reference: + # https://en.cppreference.com/w/cpp/numeric/math/lgamma + # https://en.cppreference.com/w/c/numeric/math/lgamma + + # To handle the above discrepancy, + # we replace -inf with inf so values + # that were originally -inf map to inf as expected + if x.dtype.kind == 'f': + x = np.where(x == float('-inf'), np.array(float('inf'), dtype=x.dtype), x) + + out = scipy.special.gammaln(x) + + if x.dtype == np.float16: + # `scipy.special.gammaln` returns output of float32 when input is float16, + # while `torch.lgamma` preserves `float16`. But due to smaller range of float16, + # Pytorch version outputs `inf` while SciPy returns finite values. + out = out.astype(np.float16) + + return out + op_db_scipy_reference: List[OpInfo] = [ UnaryUfuncInfo('sigmoid', ref=reference_sigmoid, @@ -1851,6 +1874,27 @@ def reference_sigmoid(x): dtypes=[torch.bfloat16]), ) ), + UnaryUfuncInfo('lgamma', + ref=reference_lgamma, + decorators=(precisionOverride({torch.float16: 7e-1}),), + dtypes=all_types_and(torch.bool), + dtypesIfCPU=all_types_and(torch.bool, torch.bfloat16), + dtypesIfCUDA=all_types_and(torch.bool, torch.half), + skips=( + # Reference: https://github.com/pytorch/pytorch/pull/50140#discussion_r552615345 + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + dtypes=[torch.bfloat16]), + # Reference: https://github.com/pytorch/pytorch/pull/50140#issuecomment-756150214 + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + dtypes=[torch.float32, torch.float64], active_if=IS_WINDOWS), + # Backward of `lgamma` uses `digamma` but `digamma` + # is not implemented for `BFloat16` + # Error Raised: + # RuntimeError: "digamma" not implemented for 'BFloat16' + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=[torch.bfloat16]), + ), + safe_casts_outputs=True), OpInfo('xlogy', dtypes=all_types_and(torch.bool), dtypesIfCPU=all_types_and(torch.bool, torch.half, torch.bfloat16), From 40eea6d9d1c46fcdf0eeb560cae808359a37229a Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Wed, 27 Jan 2021 12:58:30 -0800 Subject: [PATCH 12/41] Support device map for distributed autograd while using TensorPipe. (#44859) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44859 TensorPipe's `set_device_map` option was applied during the forward pass. However, if we ran the backward pass for the graph we would not automatically pick up the reverse device mapping. As a result, users had to specify both forward and backward device mapping which is very tedious to do. In this PR, I've added this functionality such that TensorPipe automatically picks up the reverse device mapping during the backward pass. This is done by storing the appropriate device mapping in the "recv" autograd function for distributed autograd. #Closes: https://github.com/pytorch/pytorch/issues/44170 ghstack-source-id: 119950842 Test Plan: 1) waitforbuildbot 2) Unit test added. 
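
For readers skimming the diff, a minimal sketch of the user-visible behavior this enables; the worker names, single-GPU-per-rank layout, and env:// rendezvous are illustrative assumptions, and the authoritative coverage is the new `test_device_maps_backward_pass` test added below. Only the forward device map needs to be configured; the backward pass now reuses its inverse automatically.

```
# Illustrative sketch only (assumes two processes, "worker0" and "worker1",
# one GPU each, MASTER_ADDR/MASTER_PORT set, and worker1 running a matching
# init_rpc). See test_device_maps_backward_pass in this patch for the real test.
import torch
import torch.distributed.rpc as rpc
import torch.distributed.autograd as dist_autograd

options = rpc.TensorPipeRpcBackendOptions()
# Forward mapping only: tensors sent from our cuda:0 land on worker1's cuda:1.
options.set_device_map("worker1", {0: 1})

rpc.init_rpc("worker0", rank=0, world_size=2, rpc_backend_options=options)

t1 = torch.rand(10, device=0, requires_grad=True)
t2 = torch.rand(10, device=0, requires_grad=True)
with dist_autograd.context() as context_id:
    res = rpc.rpc_sync("worker1", torch.add, args=(t1, t2))
    # Previously the reverse map ({1: 0}) had to be configured explicitly for
    # gradients to flow back; now the 'recv' autograd function records the
    # reverse mapping, so the gradients arrive back on cuda:0 automatically.
    dist_autograd.backward(context_id, [res.sum()])
    grads = dist_autograd.get_gradients(context_id)

rpc.shutdown()
```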
Reviewed By: mrshenli Differential Revision: D23751975 fbshipit-source-id: 2717d0ef5bde3db029a6172d98aad95734d52140 --- test/cpp/rpc/e2e_test_base.h | 8 ++ .../autograd/functions/recvrpc_backward.cpp | 10 +- .../autograd/functions/recvrpc_backward.h | 6 +- .../rpc_messages/rpc_with_autograd.cpp | 38 ++++++-- .../autograd/rpc_messages/rpc_with_autograd.h | 12 ++- torch/csrc/distributed/autograd/utils.cpp | 14 ++- torch/csrc/distributed/autograd/utils.h | 7 +- .../distributed/rpc/process_group_agent.cpp | 3 +- .../distributed/rpc/process_group_agent.h | 4 +- .../rpc/request_callback_no_python.cpp | 11 ++- torch/csrc/distributed/rpc/rpc_agent.cpp | 6 ++ torch/csrc/distributed/rpc/rpc_agent.h | 8 +- .../csrc/distributed/rpc/tensorpipe_agent.cpp | 93 +++++++++++++------ torch/csrc/distributed/rpc/tensorpipe_agent.h | 11 ++- .../csrc/distributed/rpc/tensorpipe_utils.cpp | 3 +- .../testing/faulty_process_group_agent.cpp | 3 +- .../rpc/testing/faulty_process_group_agent.h | 5 +- torch/csrc/distributed/rpc/utils.cpp | 10 +- .../distributed/rpc/dist_autograd_test.py | 31 +++++++ .../_internal/distributed/rpc_utils.py | 4 +- 20 files changed, 224 insertions(+), 63 deletions(-) diff --git a/test/cpp/rpc/e2e_test_base.h b/test/cpp/rpc/e2e_test_base.h index cea5079b1a4e..6526f8795c19 100644 --- a/test/cpp/rpc/e2e_test_base.h +++ b/test/cpp/rpc/e2e_test_base.h @@ -40,6 +40,14 @@ class TestE2EBase : public ::testing::Test { RpcAgent::setCurrentRpcAgent(rpcAgent); std::shared_ptr typeResolver = std::make_shared([&](const c10::QualifiedName& qn) { + // For Dict that is used for device map. + auto pos = qn.name().find("Dict"); + if (pos != std::string::npos) { + return c10::StrongTypePtr( + nullptr, + c10::DictType::create( + c10::IntType::create(), c10::IntType::create())); + } return c10::StrongTypePtr( nullptr, c10::TensorType::create(at::Tensor())); }); diff --git a/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp b/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp index 509c5c6cbd08..33df6b540000 100644 --- a/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp +++ b/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp @@ -13,10 +13,12 @@ using torch::autograd::variable_list; RecvRpcBackward::RecvRpcBackward( const AutogradMetadata& autogradMetadata, ContextPtr autogradContext, - rpc::worker_id_t fromWorkerId) + rpc::worker_id_t fromWorkerId, + std::unordered_map deviceMap) : autogradMetadata_(autogradMetadata), autogradContext_(std::move(autogradContext)), - fromWorkerId_(fromWorkerId) {} + fromWorkerId_(fromWorkerId), + deviceMap_(std::move(deviceMap)) {} variable_list RecvRpcBackward::apply(variable_list&& grads) { std::vector outputGrads; @@ -49,7 +51,9 @@ variable_list RecvRpcBackward::apply(variable_list&& grads) { auto rpcAgent = rpc::RpcAgent::getCurrentRpcAgent(); auto jitFuture = rpcAgent->send( rpcAgent->getWorkerInfo(fromWorkerId_), - std::move(gradCall).toMessage()); + std::move(gradCall).toMessage(), + rpc::kUnsetRpcTimeout, + deviceMap_); // Record the future in the context. 
sharedContext->addOutstandingRpc(jitFuture); diff --git a/torch/csrc/distributed/autograd/functions/recvrpc_backward.h b/torch/csrc/distributed/autograd/functions/recvrpc_backward.h index 982e0331c102..69be98c928ef 100644 --- a/torch/csrc/distributed/autograd/functions/recvrpc_backward.h +++ b/torch/csrc/distributed/autograd/functions/recvrpc_backward.h @@ -22,7 +22,8 @@ class TORCH_API RecvRpcBackward : public torch::autograd::Node { explicit RecvRpcBackward( const AutogradMetadata& autogradMetadata, std::shared_ptr autogradContext, - rpc::worker_id_t fromWorkerId); + rpc::worker_id_t fromWorkerId, + std::unordered_map deviceMap); torch::autograd::variable_list apply( torch::autograd::variable_list&& grads) override; @@ -38,6 +39,9 @@ class TORCH_API RecvRpcBackward : public torch::autograd::Node { // The worker id from which the RPC was received. During the backward pass, // we need to propagate the gradients to this workerId. rpc::worker_id_t fromWorkerId_; + + // Device mapping for tensors sent over RPC. + const std::unordered_map deviceMap_; }; } // namespace autograd diff --git a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp index 7389868d90c2..5aea96fa0c8b 100644 --- a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp +++ b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp @@ -18,11 +18,13 @@ RpcWithAutograd::RpcWithAutograd( worker_id_t fromWorkerId, MessageType messageType, const AutogradMetadata& autogradMetadata, - rpc::Message&& wrappedMessage) + rpc::Message&& wrappedMessage, + std::unordered_map deviceMap) : fromWorkerId_(fromWorkerId), messageType_(messageType), autogradMetadata_(autogradMetadata), - wrappedMessage_(std::move(wrappedMessage)) { + wrappedMessage_(std::move(wrappedMessage)), + deviceMap_(std::move(deviceMap)) { TORCH_INTERNAL_ASSERT( messageType_ == MessageType::FORWARD_AUTOGRAD_REQ || messageType_ == MessageType::FORWARD_AUTOGRAD_RESP); @@ -36,13 +38,15 @@ RpcWithAutograd::RpcWithAutograd( const AutogradMetadata& autogradMetadata, std::unique_ptr wrappedRpc, MessageType wrappedMessageType, - std::vector tensors) + std::vector tensors, + std::unordered_map deviceMap) : fromWorkerId_(fromWorkerId), messageType_(messageType), autogradMetadata_(autogradMetadata), wrappedRpc_(std::move(wrappedRpc)), wrappedMessageType_(wrappedMessageType), - tensors_(std::move(tensors)) { + tensors_(std::move(tensors)), + deviceMap_(std::move(deviceMap)) { TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc cannot be null!"); TORCH_INTERNAL_ASSERT( messageType_ == MessageType::FORWARD_AUTOGRAD_REQ || @@ -56,10 +60,17 @@ Message RpcWithAutograd::toMessageImpl() && { auto payload = std::move(wrappedMessage_).movePayload(); TORCH_INTERNAL_ASSERT(!payload.empty()); + // Convert deviceMap to c10::Dict for serialization. + c10::Dict deviceMap; + for (const auto& mapEntry : deviceMap_) { + deviceMap.insert(mapEntry.first, mapEntry.second); + } + std::vector ivalues{wrappedMessageType, autogradMetadata_.autogradContextId, autogradMetadata_.autogradMessageId, - fromWorkerId_}; + fromWorkerId_, + deviceMap}; // Now pickle using JIT pickler. std::vector tensorTable; @@ -92,12 +103,19 @@ std::unique_ptr RpcWithAutograd::fromMessage( auto tupleElements = rpc::readWrappedPayload(payload, message); // Gather all the fields. 
- TORCH_INTERNAL_ASSERT(tupleElements.size() == 4); + TORCH_INTERNAL_ASSERT(tupleElements.size() == 5); MessageType wrappedMessageType = static_cast(tupleElements[0].toInt()); AutogradMetadata autogradMetadata( tupleElements[1].toInt(), tupleElements[2].toInt()); worker_id_t workerId = tupleElements[3].toInt(); + auto c10DeviceMap = tupleElements[4].to>(); + + // Convert to regular map. + std::unordered_map deviceMap; + for (const auto& mapEntry : c10DeviceMap) { + deviceMap.insert({mapEntry.key(), mapEntry.value()}); + } // Create new message type and build wrapped RPC. Message wrappedMessage( @@ -116,7 +134,8 @@ std::unique_ptr RpcWithAutograd::fromMessage( autogradMetadata, std::move(wrappedRpc), wrappedMessageType, - wrappedMessage.tensors()); + wrappedMessage.tensors(), + deviceMap); } std::vector& RpcWithAutograd::tensors() { @@ -150,6 +169,11 @@ rpc::worker_id_t RpcWithAutograd::fromWorkerId() const { return fromWorkerId_; } +const std::unordered_map& RpcWithAutograd:: + deviceMap() { + return deviceMap_; +} + } // namespace autograd } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h index 657d2cf2641f..f4728ea37c63 100644 --- a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h +++ b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h @@ -18,7 +18,8 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase { rpc::worker_id_t fromWorkerId, rpc::MessageType messageType, const AutogradMetadata& autogradMetadata, - rpc::Message&& wrappedMessage); + rpc::Message&& wrappedMessage, + std::unordered_map deviceMap = {}); // Used when receiving an RPC over the wire. RpcWithAutograd( @@ -27,7 +28,8 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase { const AutogradMetadata& autogradMetadata, std::unique_ptr wrappedRpc, rpc::MessageType wrappedMessageType, - std::vector tensors); + std::vector tensors, + std::unordered_map deviceMap = {}); rpc::Message toMessageImpl() && override; @@ -52,6 +54,9 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase { // Retrieve the worker id from which the RPC originated. rpc::worker_id_t fromWorkerId() const; + // Retrieve the device map. + const std::unordered_map& deviceMap(); + private: // WorkerId from which this RPC originated. This is necessary for knowing // which worker we need to contact during the backward pass. @@ -83,6 +88,9 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase { // Tensors part of the wrappedRpc that need to be considered for autograd. std::vector tensors_; + + // Device mapping for tensors that are sent across an RPC to another node. + std::unordered_map deviceMap_; }; } // namespace autograd diff --git a/torch/csrc/distributed/autograd/utils.cpp b/torch/csrc/distributed/autograd/utils.cpp index 08bb99471686..747a958948a4 100644 --- a/torch/csrc/distributed/autograd/utils.cpp +++ b/torch/csrc/distributed/autograd/utils.cpp @@ -52,7 +52,8 @@ void addSendRpcBackward( ContextPtr addRecvRpcBackward( const AutogradMetadata& autogradMetadata, std::vector& tensors, - rpc::worker_id_t fromWorkerId) { + rpc::worker_id_t fromWorkerId, + const std::unordered_map& deviceMap) { // Initialize autograd context if necessary. 
auto& autogradContainer = DistAutogradContainer::getInstance(); auto autogradContext = @@ -61,7 +62,7 @@ ContextPtr addRecvRpcBackward( if (!tensors.empty() && torch::autograd::compute_requires_grad(tensors)) { // Attach the tensors as inputs to the autograd function. auto grad_fn = std::make_shared( - autogradMetadata, autogradContext, fromWorkerId); + autogradMetadata, autogradContext, fromWorkerId, deviceMap); for (auto& tensor : tensors) { if (tensor.requires_grad()) { torch::autograd::set_history(tensor, grad_fn); @@ -102,7 +103,8 @@ Message getMessageWithAutograd( const rpc::worker_id_t dstId, torch::distributed::rpc::Message&& wrappedRpcMsg, MessageType msgType, - bool forceGradRecording) { + bool forceGradRecording, + const std::unordered_map& deviceMap) { auto& autogradContainer = DistAutogradContainer::getInstance(); // If there is no valid context and no tensor requires grads, send original @@ -125,7 +127,8 @@ Message getMessageWithAutograd( RpcAgent::getCurrentRpcAgent()->getWorkerInfo().id_, msgType, autogradMetadata, - std::move(wrappedRpcMsg)); + std::move(wrappedRpcMsg), + deviceMap); if (tensorsRequireGrad) { // Record autograd information for 'send'. @@ -149,7 +152,8 @@ std::shared_ptr sendMessageWithAutograd( dst.id_, std::move(wrappedRpcMsg), MessageType::FORWARD_AUTOGRAD_REQ, - forceGradRecording); + forceGradRecording, + agent.getDeviceMap(dst)); std::shared_ptr fut; // If profiler is enabled, wrap this message with profiling metadata that will diff --git a/torch/csrc/distributed/autograd/utils.h b/torch/csrc/distributed/autograd/utils.h index 07ba45ed60d7..013558252fc2 100644 --- a/torch/csrc/distributed/autograd/utils.h +++ b/torch/csrc/distributed/autograd/utils.h @@ -30,7 +30,8 @@ TORCH_API void addSendRpcBackward( TORCH_API ContextPtr addRecvRpcBackward( const AutogradMetadata& autogradMetadata, std::vector& tensors, - rpc::worker_id_t fromWorkerId); + rpc::worker_id_t fromWorkerId, + const std::unordered_map& deviceMap); // This method is a wrapper utility used internally to wrap autograd info // and attach autograd function for each type of rpc call if it has valid @@ -42,7 +43,9 @@ TORCH_API rpc::Message getMessageWithAutograd( const rpc::worker_id_t dstId, rpc::Message&& wrappedRpcMsg, rpc::MessageType msgType, - bool forceGradRecording = false); + bool forceGradRecording = false, + const std::unordered_map& deviceMap = + {}); // Send message after autograd checking TORCH_API std::shared_ptr diff --git a/torch/csrc/distributed/rpc/process_group_agent.cpp b/torch/csrc/distributed/rpc/process_group_agent.cpp index 9c1a703cfa6d..3cd940f3ee49 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/process_group_agent.cpp @@ -290,7 +290,8 @@ void ProcessGroupAgent::shutdownImpl() { std::shared_ptr ProcessGroupAgent::send( const WorkerInfo& to, Message&& message, - const float rpcTimeoutSeconds) { + const float rpcTimeoutSeconds, + const std::unordered_map& deviceMap) { // Throw if we previously encountered an exception in ::listenLoop. 
{ std::unique_lock guard(listenLoopExceptionMutex_); diff --git a/torch/csrc/distributed/rpc/process_group_agent.h b/torch/csrc/distributed/rpc/process_group_agent.h index 8d2471a7d113..d1d957a66562 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.h +++ b/torch/csrc/distributed/rpc/process_group_agent.h @@ -91,7 +91,9 @@ class TORCH_API ProcessGroupAgent : public RpcAgent { std::shared_ptr send( const WorkerInfo& to, Message&& message, - const float rpcTimeoutSeconds = kUnsetRpcTimeout) override; + const float rpcTimeoutSeconds = kUnsetRpcTimeout, + const std::unordered_map& deviceMap = + {}) override; // put SendWork into a queue and notify the worker thread virtual void enqueueSend(SendWork work); diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.cpp b/torch/csrc/distributed/rpc/request_callback_no_python.cpp index 09c56dc960c9..46192d7eb317 100644 --- a/torch/csrc/distributed/rpc/request_callback_no_python.cpp +++ b/torch/csrc/distributed/rpc/request_callback_no_python.cpp @@ -345,11 +345,20 @@ void RequestCallbackNoPython::processForwardAutogradReq( const std::shared_ptr& responseFuture) const { auto& rpcWithAutograd = static_cast(rpc); + // Need to reverse the device map for the backward pass of distributed + // autograd. + std::unordered_map reverseDeviceMap; + for (const auto& mapEntry : rpcWithAutograd.deviceMap()) { + reverseDeviceMap.insert({mapEntry.second, mapEntry.first}); + } + + // Attach 'recv' autograd function. auto autogradContext = addRecvRpcBackward( rpcWithAutograd.autogradMetadata(), rpcWithAutograd.tensors(), - rpcWithAutograd.fromWorkerId()); + rpcWithAutograd.fromWorkerId(), + reverseDeviceMap); // For this recv thread on server side, before processRpc(), // set current_context_id_ to be context_id passed from client. // In this way, if there is nested rpc call in python rpc call, original diff --git a/torch/csrc/distributed/rpc/rpc_agent.cpp b/torch/csrc/distributed/rpc/rpc_agent.cpp index 2033b2b771e2..5c9570bcac1d 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.cpp +++ b/torch/csrc/distributed/rpc/rpc_agent.cpp @@ -286,6 +286,12 @@ bool RpcAgent::isGILProfilingEnabled() { return profilingEnabled_.load(); } +std::unordered_map RpcAgent::getDeviceMap( + const WorkerInfo& dest) { + // Default implementation has no device map. + return {}; +} + std::unordered_map RpcAgent::getDebugInfo() { /* This would later include more info other than metrics for eg: may include stack traces for the threads owned by the agent */ diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index bfc6c38c07a1..956af3da899b 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -160,7 +160,9 @@ class TORCH_API RpcAgent { virtual std::shared_ptr send( const WorkerInfo& to, Message&& message, - const float rpcTimeoutSeconds = kUnsetRpcTimeout) = 0; + const float rpcTimeoutSeconds = kUnsetRpcTimeout, + const std::unordered_map& deviceMap = + {}) = 0; // Retries sending the message up to maxRetries times until an ACK is // receieved. The duration between consecutive sends is increased over @@ -259,6 +261,10 @@ class TORCH_API RpcAgent { // Get the type resolver std::shared_ptr getTypeResolver(); + // Retrieves the device map for the provided destination worker. 
+ virtual std::unordered_map getDeviceMap( + const WorkerInfo& dest); + protected: const WorkerInfo workerInfo_; const std::unique_ptr cb_; diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index 0417577d2499..518fc72e8304 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -39,6 +39,42 @@ const std::string kClientActiveCalls = "agent.client_active_calls"; const std::string kServerActiveCalls = "agent.server_active_calls"; const std::string kServerActiveAsyncCalls = "agent.server_active_async_calls"; +std::vector getDevicesForTensors( + const std::vector& tensors, + const tensorpipe::DeviceMap& deviceMap, + const std::string& remoteName) { + // If the deviceMap is overridden, use that instead. + const auto errStr = c10::str( + "TensorPipe RPC backend only supports CPU tensors by default, please " + "move your tensors to CPU before sending them over RPC, or call " + "`set_device_map` on `TensorPipeRpcBackendOptions` to explicitly " + "configure device mapping. ", + "Request device mapping is not available for destination ", + remoteName); + std::vector deviceIndices; + deviceIndices.reserve(tensors.size()); + bool hasCudaTensor = false; + for (const auto& t : tensors) { + if (t.device().is_cpu()) { + deviceIndices.push_back(-1); + } else { + const auto deviceIter = deviceMap.find(t.device().index()); + TORCH_CHECK( + deviceIter != deviceMap.end(), + errStr, + " for device ", + t.device(), + " but received a tensor on that device."); + deviceIndices.push_back(deviceIter->second); + hasCudaTensor = true; + } + } + if (!hasCudaTensor) { + deviceIndices.clear(); + } + return deviceIndices; +} + } // namespace // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) @@ -547,7 +583,9 @@ void TensorPipeAgent::pipeWrite( Message&& rpcMessage, std::vector&& devices, std::shared_ptr ctx, - std::function fn) noexcept { + std::function fn, + const std::unordered_map& + deviceMap) noexcept { tensorpipe::Message tpMessage; TensorpipeWriteBuffers tpBuffers; @@ -587,7 +625,7 @@ void TensorPipeAgent::sendCompletedResponseMessage( responseMessage.setId(messageId); std::vector devices; try { - devices = getDevicesForTensors(pipe->getRemoteName(), responseMessage); + devices = getDevicesForRemote(pipe->getRemoteName(), responseMessage); } catch (const std::exception& e) { responseMessage = createExceptionResponse(e.what(), messageId); } @@ -721,7 +759,8 @@ void TensorPipeAgent::respond(std::shared_ptr& pipe) { std::shared_ptr TensorPipeAgent::send( const WorkerInfo& toWorkerInfo, Message&& requestMessage, - const float rpcTimeoutSeconds) { + const float rpcTimeoutSeconds, + const std::unordered_map& deviceMap) { TORCH_CHECK( requestMessage.isRequest(), "TensorPipeAgent::send(..) is only for sending requests."); @@ -761,8 +800,15 @@ std::shared_ptr TensorPipeAgent::send( // Get devices for tensors in the request message. This can throw if device // maps are not configured properly for this request. - auto devices = - getDevicesForTensors(clientPipe.pipe_->getRemoteName(), requestMessage); + std::vector devices; + if (deviceMap.empty()) { + devices = + getDevicesForRemote(clientPipe.pipe_->getRemoteName(), requestMessage); + } else { + // If deviceMap is specified, use that instead. 
+ devices = getDevicesForTensors( + requestMessage.tensors(), deviceMap, clientPipe.pipe_->getRemoteName()); + } futureResponseMessage->jitFuture->addCallback([this]() { TORCH_INTERNAL_ASSERT( @@ -906,7 +952,8 @@ std::shared_ptr TensorPipeAgent::send( std::move(ctx)); } }); - }); + }, + deviceMap); return futureResponseMessage->jitFuture; } @@ -1196,7 +1243,7 @@ void TensorPipeAgent::markFutureWithError( } } -std::vector TensorPipeAgent::getDevicesForTensors( +std::vector TensorPipeAgent::getDevicesForRemote( const std::string& remoteName, const Message& message) const { const auto& deviceMaps = @@ -1222,30 +1269,16 @@ std::vector TensorPipeAgent::getDevicesForTensors( } return {}; } else { - std::vector deviceIndices; - deviceIndices.reserve(message.tensors().size()); - const auto& deviceMap = iter->second; - bool hasCudaTensor = false; - for (const auto& t : message.tensors()) { - if (t.device().is_cpu()) { - deviceIndices.push_back(-1); - } else { - const auto deviceIter = deviceMap.find(t.device().index()); - TORCH_CHECK( - deviceIter != deviceMap.end(), - errStr, - " for device ", - t.device(), - " but received a tensor on that device."); - deviceIndices.push_back(deviceIter->second); - hasCudaTensor = true; - } - } - if (!hasCudaTensor) { - deviceIndices.clear(); - } - return deviceIndices; + return getDevicesForTensors(message.tensors(), iter->second, errStr); + } +} + +tensorpipe::DeviceMap TensorPipeAgent::getDeviceMap(const WorkerInfo& dest) { + auto it = opts_.deviceMaps.find(dest.name_); + if (it == opts_.deviceMaps.end()) { + return {}; } + return it->second; } size_t TensorPipeAgent::timeoutMapSize() { diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.h b/torch/csrc/distributed/rpc/tensorpipe_agent.h index b4d8796aeede..078750385538 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.h +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.h @@ -185,7 +185,9 @@ class TensorPipeAgent : public RpcAgent { std::shared_ptr send( const WorkerInfo& to, Message&& message, - const float rpcTimeoutSeconds = kUnsetRpcTimeout) override; + const float rpcTimeoutSeconds = kUnsetRpcTimeout, + const std::unordered_map& deviceMap = + {}) override; // join() and sync() would be deprecated - // https://github.com/pytorch/pytorch/issues/27647 @@ -209,6 +211,8 @@ class TensorPipeAgent : public RpcAgent { void addGilWaitTime(const std::chrono::microseconds gilWaitTime) override; + tensorpipe::DeviceMap getDeviceMap(const WorkerInfo& dest) override; + using NetworkDataDict = std::unordered_map; @@ -252,7 +256,8 @@ class TensorPipeAgent : public RpcAgent { Message&& message, std::vector&& devices, std::shared_ptr ctx, - std::function) noexcept; + std::function, + const tensorpipe::DeviceMap& deviceMap = {}) noexcept; // Callback of listener accept() void onListenerAccepted( @@ -279,7 +284,7 @@ class TensorPipeAgent : public RpcAgent { uint64_t requestSize, const std::string& destWorkerName); - inline std::vector getDevicesForTensors( + inline std::vector getDevicesForRemote( const std::string& remoteName, const Message& message) const; diff --git a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp index 1d17e4451372..9757e0971d2b 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp @@ -91,7 +91,8 @@ std::tuple tensorpipeSerialize( // Enforce memory copy if tensor is created from torch::from_blob, means // that the tensor doesn't own the memory. 
std::string metadata = - deviceIndices.empty() ? "" : std::to_string(deviceIndices[i]); + deviceIndices.empty() || deviceIndices[i] == -1 + ? "" : std::to_string(deviceIndices[i]); if (!tensorData.storageHasDeleter()) { std::vector storageData( diff --git a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp index 57dbef3a549b..870f9702ee0e 100644 --- a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp @@ -59,7 +59,8 @@ std::unordered_map> FaultyProcessGroupAgent:: std::shared_ptr FaultyProcessGroupAgent::send( const WorkerInfo& to, Message&& message, - const float rpcTimeoutSeconds) { + const float rpcTimeoutSeconds, + const std::unordered_map& deviceMap) { // We only fail control messages that have been specified by the test case. // For all other messages, we just send them without any failures. if (!shouldFailMessage(message.type())) { diff --git a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.h b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.h index 8cbe4c9a137d..ce8fee558274 100644 --- a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.h +++ b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.h @@ -46,8 +46,9 @@ class FaultyProcessGroupAgent : public ProcessGroupAgent { std::shared_ptr send( const WorkerInfo& to, Message&& message, - const float rpcTimeoutSeconds = - torch::distributed::rpc::kUnsetRpcTimeout) override; + const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout, + const std::unordered_map& deviceMap = + {}) override; protected: // This function checks the messageTypesToFail_ to determine whether to use diff --git a/torch/csrc/distributed/rpc/utils.cpp b/torch/csrc/distributed/rpc/utils.cpp index 0f137a72a252..d643ab87b8ea 100644 --- a/torch/csrc/distributed/rpc/utils.cpp +++ b/torch/csrc/distributed/rpc/utils.cpp @@ -174,11 +174,19 @@ std::unique_ptr deserializeResponse( RpcCommandBase& rpc = *rpcPtr; auto& rpcWithAutograd = static_cast(rpc); + // Need to reverse the device map for the backward pass of distributed + // autograd. + std::unordered_map reverseDeviceMap; + for (const auto& mapEntry : rpcWithAutograd.deviceMap()) { + reverseDeviceMap.insert({mapEntry.second, mapEntry.first}); + } + // Attach 'recv' autograd function. addRecvRpcBackward( rpcWithAutograd.autogradMetadata(), rpcWithAutograd.tensors(), - rpcWithAutograd.fromWorkerId()); + rpcWithAutograd.fromWorkerId(), + reverseDeviceMap); wrappedMsgType = rpcWithAutograd.wrappedMessageType(); diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index 15d5cfeca214..54e936bf0f0d 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -2235,3 +2235,34 @@ def test_verify_backend_options(self): self.assertEqual(self.rpc_backend_options.num_send_recv_threads, 8) self.assertEqual(self.rpc_backend_options.num_fail_sends, 3) self.assertEqual(len(self.rpc_backend_options.messages_to_fail), 4) + +class TensorPipeDistAutogradTest(RpcAgentTestFixture): + + @skip_if_lt_x_gpu(4) + def test_device_maps_backward_pass(self): + options = self.rpc_backend_options + dst = worker_name((self.rank + 1) % self.world_size) + + # The reverse of this device mapping should be used for the backward pass. 
+ options.set_device_map(dst, {self.rank: (self.rank + 1) % self.world_size}) + + rpc.init_rpc( + name=worker_name(self.rank), + backend=self.rpc_backend, + rank=self.rank, + world_size=self.world_size, + rpc_backend_options=options, + ) + + t1 = torch.rand(10, device=self.rank, requires_grad=True) + t2 = torch.rand(10, device=self.rank, requires_grad=True) + with dist_autograd.context() as context_id: + res = rpc.rpc_sync(dst, torch.add, args=(t1, t2)) + dist_autograd.backward(context_id, [res.sum()]) + grads = dist_autograd.get_gradients(context_id) + self.assertEqual(torch.ones(10), grads[t1]) + self.assertEqual(torch.ones(10), grads[t2]) + self.assertEqual(t1.device, grads[t1].device) + self.assertEqual(t2.device, grads[t2].device) + + rpc.shutdown() diff --git a/torch/testing/_internal/distributed/rpc_utils.py b/torch/testing/_internal/distributed/rpc_utils.py index d35f3da5d2c2..bdf4bbd6eb78 100644 --- a/torch/testing/_internal/distributed/rpc_utils.py +++ b/torch/testing/_internal/distributed/rpc_utils.py @@ -23,6 +23,7 @@ from torch.testing._internal.distributed.rpc.dist_autograd_test import ( DistAutogradTest, FaultyAgentDistAutogradTest, + TensorPipeDistAutogradTest ) from torch.testing._internal.distributed.rpc.dist_optimizer_test import ( DistOptimizerTest, @@ -139,7 +140,8 @@ class MultiProcess(Flag): # These suites should be standalone, and separate from the ones in the generic # list (not subclasses of those!). TENSORPIPE_TESTS = [ - TensorPipeAgentRpcTest + TensorPipeAgentRpcTest, + TensorPipeDistAutogradTest ] From f7e90cf31129163e31ee8bcbd37f54e7f9ddc39c Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Wed, 27 Jan 2021 13:23:46 -0800 Subject: [PATCH 13/41] Revert D26089965: [quant][graphmode][fx] Add support for functional conv1d and conv3d Test Plan: revert-hammer Differential Revision: D26089965 (https://github.com/pytorch/pytorch/commit/dd1a97b3ae8a14a5c167fd97273531d2b5ad566b) Original commit changeset: 4aea507d05b7 fbshipit-source-id: f54184cafb9dd07858683489d8bd147474e7e4b3 --- test/quantization/test_quantize_fx.py | 60 ++++++------------- .../quantization/fx/quantization_patterns.py | 20 ++----- torch/quantization/fx/utils.py | 26 -------- 3 files changed, 22 insertions(+), 84 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 295144bddc3f..b2243eead1d0 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -1598,21 +1598,14 @@ def forward(self, x): def test_functional_conv(self): """ Test for function conv and functional conv + relu """ - convs = { - 1: torch.nn.functional.conv1d, - 2: torch.nn.functional.conv2d, - 3: torch.nn.functional.conv3d, - } - class FuncConv(torch.nn.Module): - def __init__(self, dim, use_bias, has_relu, f_relu): + def __init__(self, use_bias, has_relu, f_relu): super().__init__() - self.dim = dim - self.w = torch.randn(tuple([3] * (dim + 2))) + self.w = torch.randn(3, 3, 3, 3) self.b = torch.randn(3) if use_bias else None - self.stride = tuple([1] * dim) - self.padding = tuple([0] * dim) - self.dilation = tuple([1] * dim) + self.stride = (1, 1) + self.padding = (0, 0) + self.dilation = (1, 1) self.groups = 1 self.use_bias = use_bias if has_relu: @@ -1624,10 +1617,12 @@ def __init__(self, dim, use_bias, has_relu, f_relu): self.relu = torch.nn.Identity() def forward(self, x): - x = convs[self.dim](x, self.w, self.b, self.stride, self.padding, self.dilation, self.groups) + x = F.conv2d(x, self.w, self.b, self.stride, self.padding, 
self.dilation, self.groups) x = self.relu(x) return x + data = (torch.randn((2, 3, 4, 4), dtype=torch.float),) + quant_type_to_prepare_expected_node_occurrence = { QuantType.DYNAMIC: {}, # There should be 3 observers: after input, weight and activation. @@ -1641,50 +1636,31 @@ def forward(self, x): }, } quant_type_to_qconv_fun = { - QuantType.STATIC: { - 1: ns.call_function(torch.ops.quantized.conv1d), - 2: ns.call_function(torch.ops.quantized.conv2d), - 3: ns.call_function(torch.ops.quantized.conv3d) - }, - QuantType.QAT: { - 1: ns.call_function(torch.ops.quantized.conv1d), - 2: ns.call_function(torch.ops.quantized.conv2d), - 3: ns.call_function(torch.ops.quantized.conv3d) - }, + QuantType.STATIC: ns.call_function(torch.ops.quantized.conv2d), + QuantType.QAT: ns.call_function(torch.ops.quantized.conv2d), } quant_type_to_qconv_relu_fun = { - QuantType.STATIC: { - 1: ns.call_function(torch.ops.quantized.conv1d_relu), - 2: ns.call_function(torch.ops.quantized.conv2d_relu), - 3: ns.call_function(torch.ops.quantized.conv3d_relu) - }, - QuantType.QAT: { - 1: ns.call_function(torch.ops.quantized.conv1d_relu), - 2: ns.call_function(torch.ops.quantized.conv2d_relu), - 3: ns.call_function(torch.ops.quantized.conv3d_relu) - }, + QuantType.STATIC: ns.call_function(torch.ops.quantized.conv2d_relu), + QuantType.QAT: ns.call_function(torch.ops.quantized.conv2d_relu), } options = itertools.product( - [1, 2, 3], # dims self.static_quant_types, (True, False), # use_bias (True, False), # has_relu (True, False), # functional relu ) - for dim, quant_type, use_bias, has_relu, f_relu in options: - data_dims = [2, 3] + [4] * dim - data = (torch.randn(tuple(data_dims), dtype=torch.float),) - model = FuncConv(dim, use_bias, has_relu, f_relu) + for quant_type, use_bias, has_relu, f_relu in options: + model = FuncConv(use_bias, has_relu, f_relu) if has_relu: - qconv_fun = quant_type_to_qconv_relu_fun[quant_type][dim] + qconv_fun = quant_type_to_qconv_relu_fun[quant_type] else: - qconv_fun = quant_type_to_qconv_fun[quant_type][dim] + qconv_fun = quant_type_to_qconv_fun[quant_type] convert_node_occurrence = { - ns.call_function(torch.quantize_per_tensor): 1, + ns.call_function(torch.quantize_per_tensor): 1 if quant_type != QuantType.DYNAMIC else 0, qconv_fun: 1, - ns.call_method("dequantize"): 1 + ns.call_method("dequantize"): 1 if quant_type != QuantType.DYNAMIC else 0 } prepare_expected_node_occurrence = \ quant_type_to_prepare_expected_node_occurrence[quant_type] diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 7d0e3f7d0a78..06f15240e761 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -32,8 +32,6 @@ quantize_node, get_per_tensor_qparams, get_linear_prepack_op_for_dtype, - get_qconv_prepack_op, - get_qconv_op, ) from .quantization_types import QuantizerCls @@ -190,10 +188,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, @register_quant_pattern(torch.nn.Conv1d) @register_quant_pattern(torch.nn.Conv2d) @register_quant_pattern(torch.nn.Conv3d) -@register_quant_pattern(torch.nn.functional.conv1d) @register_quant_pattern(torch.nn.functional.conv2d) -@register_quant_pattern(torch.nn.functional.conv3d) -# TODO: add qat.Conv1d and qat.Conv3d @register_quant_pattern(torch.nn.qat.Conv2d) @register_quant_pattern(torch.nn.intrinsic.ConvReLU1d) @register_quant_pattern(torch.nn.intrinsic.ConvReLU2d) @@ -203,12 +198,8 @@ def convert(self, quantizer: QuantizerCls, node: 
Node, load_arg: Callable, @register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU1d) @register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU2d) @register_quant_pattern(torch.nn.intrinsic.qat.ConvReLU2d) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv1d)) @register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv2d)) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv3d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv1d)) @register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv2d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv3d)) # just for error checks @register_quant_pattern((torch.nn.ReLU, torch.nn.Conv2d)) @register_quant_pattern((torch.nn.functional.relu, torch.nn.Conv2d)) @@ -221,10 +212,8 @@ def __init__(self, quantizer: QuantizerCls, node: Node): self.relu_node = node node = node.args[0] # type: ignore self.conv_node = node - if node.op == "call_module": + if node.op == 'call_module': self.conv = quantizer.modules[self.conv_node.target] - elif node.op == "call_function": - self.conv = node.target def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, debug: bool = False, @@ -286,7 +275,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, args = load_arg(quantized=False)(self.conv_node.args) kwargs = load_arg(quantized=False)(self.conv_node.kwargs) op_out = quantizer.quantized_graph.create_node( - "call_function", self.conv, args, kwargs) + "call_function", torch.nn.functional.conv2d, args, kwargs) if self.relu_node: relu_args = [op_out] relu_args.extend(load_arg(quantized=False)(self.relu_node.args[1:])) @@ -311,14 +300,13 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, weight = load_arg(quantized=True)(self.conv_node.args[1]) other_args = load_arg(quantized=False)(self.conv_node.args[2:]) prepack_args = tuple([weight] + list(other_args)) - prepack_op = get_qconv_prepack_op(self.conv) packed_weight = quantizer.quantized_graph.create_node( - "call_function", prepack_op, prepack_args, {}) + 'call_function', torch.ops.quantized.conv2d_prepack, prepack_args, {}) assert activation_statically_quantized, \ "currently only static quantization is supported for conv" # construct conv input if activation_statically_quantized: - qconv_op = get_qconv_op(self.conv, self.relu_node is not None) + qconv_op = torch.ops.quantized.conv2d_relu if self.relu_node else torch.ops.quantized.conv2d conv_input = load_arg(quantized=True)(self.conv_node.args[0]) act_post_process_name = self.relu_node.name if self.relu_node else self.conv_node.name activation_post_process = quantizer.activation_post_process_map[act_post_process_name] diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 8d3445e46dba..8285e204b1ed 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -179,32 +179,6 @@ def get_linear_prepack_op_for_dtype(dtype): else: raise Exception("can't get linear prepack op for dtype:", dtype) -def get_qconv_prepack_op(conv_op: Callable) -> Callable: - prepack_ops = { - torch.nn.functional.conv1d: torch.ops.quantized.conv1d_prepack, - torch.nn.functional.conv2d: torch.ops.quantized.conv2d_prepack, - torch.nn.functional.conv3d: torch.ops.quantized.conv3d_prepack - } - prepack_op = prepack_ops.get(conv_op, None) - assert prepack_op, "Didn't find prepack op for {}".format(conv_op) - return prepack_op - -def get_qconv_op(conv_op: Callable, has_relu: bool) -> 
Callable: - qconv_op = { - # has relu - True: { - torch.nn.functional.conv1d: torch.ops.quantized.conv1d_relu, - torch.nn.functional.conv2d: torch.ops.quantized.conv2d_relu, - torch.nn.functional.conv3d: torch.ops.quantized.conv3d_relu - }, - False: { - torch.nn.functional.conv1d: torch.ops.quantized.conv1d, - torch.nn.functional.conv2d: torch.ops.quantized.conv2d, - torch.nn.functional.conv3d: torch.ops.quantized.conv3d - } - } - return qconv_op[has_relu].get(conv_op) - # Returns a function that can get a new attribute name for module with given # prefix, for example, # >> get_new_observer_name = get_new_attr_name_with_prefix('_observer') From 1c8d11c9e2ff64d92fb322c573d287588c26a718 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 27 Jan 2021 13:43:09 -0800 Subject: [PATCH 14/41] [PyTorch] Save a refcount bump in make_variable (#51180) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51180 This fast path still did a refcount bump because it copied the inner intrusive_ptr to the stack. Now it's moved. ghstack-source-id: 120460258 Test Plan: 1) profile empty benchmark & inspect assembly to verify move 2) run framework overhead benchmarks Reviewed By: bhosmer Differential Revision: D26094951 fbshipit-source-id: b2e09f9ad885cb633402885ca1e61a370723f6b8 --- aten/src/ATen/templates/TensorBody.h | 4 ++++ torch/csrc/autograd/variable.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index 731cb79e031f..77b9ae978158 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -147,6 +147,10 @@ class TORCH_API Tensor { return impl_; } + c10::intrusive_ptr unsafeReleaseIntrusivePtr() { + return std::move(impl_); + } + bool defined() const { return impl_; } diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index 83bbc9406081..ad8c1919dee6 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -666,7 +666,7 @@ inline Variable make_variable( bool allow_tensor_metadata_change = true) { if (data.defined()) { if (data.getIntrusivePtr().use_count() == 1 && data.getIntrusivePtr()->unique_version()) { - auto data_impl = data.getIntrusivePtr(); + auto data_impl = data.unsafeReleaseIntrusivePtr(); data_impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); if (requires_grad) { data_impl->set_autograd_meta(std::make_unique(data_impl.get(), requires_grad)); From eaf5ca09dc9939f15230638cdba0af0ac46eb2d8 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 27 Jan 2021 13:58:43 -0800 Subject: [PATCH 15/41] Migrate masked_scatter_ CUDA to ATen (#50039) Summary: Fixes https://github.com/pytorch/pytorch/issues/49542 Pull Request resolved: https://github.com/pytorch/pytorch/pull/50039 Reviewed By: heitorschueroff Differential Revision: D26096247 Pulled By: ngimel fbshipit-source-id: ec1810d3412e0d7ab6b950265a3123519ad886c1 --- aten/src/ATen/LegacyTHFunctionsCUDA.h | 2 - aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp | 160 ------------------ aten/src/ATen/native/cuda/IndexKernel.cu | 103 +++++++++++ .../ATen/native/cuda/LegacyDefinitions.cpp | 15 -- aten/src/THC/THCTensorMasked.cuh | 16 -- aten/src/THC/generic/THCTensorMasked.cu | 141 --------------- aten/src/THC/generic/THCTensorMasked.h | 19 --- test/test_torch.py | 93 +++++----- 8 files changed, 153 insertions(+), 396 deletions(-) diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h index 
a9004a5b6d01..069a2c1152c4 100644 --- a/aten/src/ATen/LegacyTHFunctionsCUDA.h +++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h @@ -20,8 +20,6 @@ namespace cuda { Tensor & _th_masked_fill_(Tensor & self, const Tensor & mask, Scalar value); Tensor & _th_masked_fill_bool_(Tensor & self, const Tensor & mask, Scalar value); -Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source); -Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source); Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source); Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index); Tensor _th_take(const Tensor & self, const Tensor & index); diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp index ddae7965dded..b60968a4b041 100644 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -200,166 +200,6 @@ Tensor & _th_masked_fill_bool_(Tensor & self, const Tensor & mask, Scalar value) } return self; } -Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Bool: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaBoolTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaByteTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaCharTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, 
dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaIntTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaLongTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaShortTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaHalfTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::BFloat16: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaBFloat16Tensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - default: - AT_ERROR("_th_masked_scatter_ not supported on CUDAType for ", dispatch_scalar_type); - } - return self; -} -Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Bool: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, 
"_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaBoolTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaByteTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaCharTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaIntTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaLongTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - 
case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaShortTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaHalfTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::BFloat16: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaBFloat16Tensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - default: - AT_ERROR("_th_masked_scatter_bool_ not supported on CUDAType for ", dispatch_scalar_type); - } - return self; -} Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index d88f202487af..091e1ec22a19 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -10,7 +10,13 @@ #include #include #include +#include #include +#include + +#include +#include +#include namespace at { namespace native { @@ -252,6 +258,103 @@ Tensor& take_out_cuda(Tensor& out, const Tensor& self, const Tensor& index) { return out; } +namespace { + +template +void masked_scatter_cuda_impl(Tensor& self, const Tensor& mask, const Tensor& source){ + auto srcSize = source.numel(); + + // Determine our output size + auto totalElements = mask.sum().item(); + + // The number of `1` elements present in the mask must be <= the + // number of elements available in `src` + TORCH_CHECK(totalElements <= srcSize, "source nElements must be == mask `1` elements"); + + auto mask_cont = mask.contiguous(); + + // Use a prefix sum to determine the output locations of the masked elements + auto maskPrefixSum = at::empty_like(mask, mask.options().dtype(kLong)); + + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + + thrust::device_ptr maskData(mask_cont.data_ptr()); + thrust::device_ptr maskPrefixSumData( + maskPrefixSum.data_ptr()); + + thrust::exclusive_scan( + thrust::cuda::par(allocator).on(c10::cuda::getCurrentCUDAStream()), + maskData, + maskData + mask_cont.numel(), + maskPrefixSumData); + + // We are getting elements from `src` based on an offset from + // `maskPrefixSum`, so that should be made contiguous too + auto source_contig = 
source.contiguous(); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self) + .add_input(self) + .add_input(mask_cont) + .add_input(maskPrefixSum) + .build(); + + AT_DISPATCH_ALL_TYPES_AND3( + ScalarType::Bool, + ScalarType::BFloat16, + ScalarType::Half, + self.scalar_type(), + "masked_scatter_", + [&]() { + auto source_ptr = source_contig.data_ptr(); + gpu_kernel( + iter, [=] GPU_LAMBDA(scalar_t a, mask_t mask, int64_t maskPrefixSum) -> scalar_t { + if (mask) { + return source_ptr[maskPrefixSum]; + } + return a; + }); + cudaGetLastError(); + }); +} + +} // anonymous namespace + +Tensor & masked_scatter__cuda(Tensor& self, const Tensor& mask, const Tensor& source) { + at::assert_no_internal_overlap(self); + TORCH_CHECK( + self.scalar_type() == source.scalar_type(), + "masked_scatter: expected self and source to have same dtypes but got", + self.scalar_type(), + " and ", + source.scalar_type()); + + TensorArg self_arg{self, "self", 1}; + TensorArg mask_arg{mask, "mask", 2}; + TensorArg source_arg{source, "source", 3}; + checkAllSameGPU("masked_scatter_", {self_arg, mask_arg, source_arg}); + + Tensor b_mask; + std::tie(b_mask) = expand_inplace(self, mask, "masked_scatter_"); + + if (b_mask.dtype() == ScalarType::Byte) { + TORCH_WARN("masked_scatter_ received a mask with dtype torch.uint8, this behavior is now deprecated," \ + "please use a mask with dtype torch.bool instead."); + } + + auto mask_dtype = b_mask.scalar_type(); + if (mask_dtype == ScalarType::Bool) { + masked_scatter_cuda_impl(self, b_mask, source); + } else { + masked_scatter_cuda_impl(self, b_mask, source); + } + + return self; +} + REGISTER_DISPATCH(index_stub, &index_kernel); REGISTER_DISPATCH(index_put_stub, &index_put_kernel); diff --git a/aten/src/ATen/native/cuda/LegacyDefinitions.cpp b/aten/src/ATen/native/cuda/LegacyDefinitions.cpp index 1bbe47dbfb2e..735f2c8b2875 100644 --- a/aten/src/ATen/native/cuda/LegacyDefinitions.cpp +++ b/aten/src/ATen/native/cuda/LegacyDefinitions.cpp @@ -61,19 +61,4 @@ Tensor & masked_fill__cuda(Tensor& self, const Tensor & mask, const Tensor & val return self; } -Tensor & masked_scatter__cuda(Tensor& self, const Tensor & mask, const Tensor & source) { - at::assert_no_internal_overlap(self); - Tensor b_mask; - std::tie(b_mask) = expand_inplace(self, mask, "masked_scatter_"); - // As we dispatch on self and TH is type-checked, we need different definitions. - // This can be fixed by moving to ATen. 
- if (b_mask.dtype() == at::ScalarType::Byte) { - TORCH_WARN("masked_scatter_ received a mask with dtype torch.uint8, this behavior is now deprecated," \ - "please use a mask with dtype torch.bool instead."); - return legacy::cuda::_th_masked_scatter_(self, b_mask, source); - } else { - return legacy::cuda::_th_masked_scatter_bool_(self, b_mask, source); - } -} - }} // namespace at::native diff --git a/aten/src/THC/THCTensorMasked.cuh b/aten/src/THC/THCTensorMasked.cuh index 88f2d78b698d..4e696ba392ce 100644 --- a/aten/src/THC/THCTensorMasked.cuh +++ b/aten/src/THC/THCTensorMasked.cuh @@ -25,22 +25,6 @@ struct TensorMaskedFillOp { T value; }; -template -struct TensorMaskedCopyOp { - TensorMaskedCopyOp(T* s) : in(s) {} - - __device__ inline void operator()(T* out, - MaskT* mask, - MaskPrefixSumT* maskPrefixSum) { - if (*mask) { - *out = in[*maskPrefixSum]; - } - } - - // Where we are copying from - T* in; -}; - template struct TensorMaskedSelectOp { TensorMaskedSelectOp(T* t) : out(t) {} diff --git a/aten/src/THC/generic/THCTensorMasked.cu b/aten/src/THC/generic/THCTensorMasked.cu index 4e93ac260e42..4a3c93241aec 100644 --- a/aten/src/THC/generic/THCTensorMasked.cu +++ b/aten/src/THC/generic/THCTensorMasked.cu @@ -47,145 +47,4 @@ void THCTensor_(maskedFillByte)(THCState* state, THCudaByteTensor_free(state, maskCuda); } -void THCTensor_(maskedCopy)(THCState* state, - THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); - ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask); - ptrdiff_t tensorSize = THCTensor_(nElement)(state, tensor); - ptrdiff_t srcSize = THCTensor_(nElement)(state, src); - - // `mask` and `tensor` must have the same number of elements - THArgCheck(maskSize == tensorSize, 2, - "mask and tensor must have the same number of elements"); - - // Determine our output size - int64_t totalElements = THTensor_wrap(mask).sum().item(); - - // The number of `1` elements present in the mask must be <= the - // number of elements available in `src` - if (totalElements > srcSize) { - THArgCheck(false, 2, "source nElements must be == mask `1` elements"); - } - - // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed - // iterator prefix sums? 
Convert `mask` to the same datatype as what - // we're accumulating the prefix sum in (int64_t) to get around it - THCudaLongTensor* maskLong = THCudaLongTensor_new(state); - at::IntArrayRef maskSizes = mask->sizes(); - THCudaLongTensor_resize(state, maskLong, maskSizes, {}); - THCTensor_(copy)(state, maskLong, mask); - - // Use a prefix sum to determine the output locations of the masked elements - THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state); - THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, {}); - - THCThrustAllocator thrustAlloc(state); - thrust::device_ptr - maskData(THCudaLongTensor_data(state, maskLong)); - thrust::device_ptr - maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum)); - - thrust::exclusive_scan( -#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__ - thrust::cuda::par(thrustAlloc).on(c10::cuda::getCurrentCUDAStream()), -#endif - maskData, - maskData + THCudaLongTensor_nElement(state, maskLong), - maskPrefixSumData); - - // We are getting elements from `src` based on an offset from - // `maskPrefixSum`, so that should be made contiguous too - THCTensor* contigSrc = THCTensor_(newContiguous)(state, src); - - // update `tensor` where `mask` == 1 but pull from `src` at - // maskPrefixSum - bool status = THC_pointwiseApply3( - state, tensor, mask, maskPrefixSum, - TensorMaskedCopyOp( - THCTensor_(data)(state, contigSrc))); - - THCTensor_(free)(state, contigSrc); - THCudaLongTensor_free(state, maskLong); - THCudaLongTensor_free(state, maskPrefixSum); - - THArgCheck(status, 2, CUTORCH_DIM_WARNING); - THCudaCheck(cudaGetLastError()); -} - -void THCTensor_(maskedCopyBool)(THCState* state, - THCTensor *tensor, THCudaBoolTensor *mask, THCTensor *src) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); - ptrdiff_t maskSize = THCudaBoolTensor_nElement(state, mask); - ptrdiff_t tensorSize = THCTensor_(nElement)(state, tensor); - ptrdiff_t srcSize = THCTensor_(nElement)(state, src); - - // `mask` and `tensor` must have the same number of elements - THArgCheck(maskSize == tensorSize, 2, - "mask and tensor must have the same number of elements"); - - // Determine our output size - int64_t totalElements = THTensor_wrap(mask).sum().item(); - - // The number of `1` elements present in the mask must be <= the - // number of elements available in `src` - if (totalElements > srcSize) { - THArgCheck(false, 2, "source nElements must be == mask `1` elements"); - } - - // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed - // iterator prefix sums? 
Convert `mask` to the same datatype as what - // we're accumulating the prefix sum in (int64_t) to get around it - THCudaLongTensor* maskLong = THCudaLongTensor_new(state); - at::IntArrayRef maskSizes = mask->sizes(); - THCudaLongTensor_resize(state, maskLong, maskSizes, {}); - THCTensor_(copy)(state, maskLong, mask); - - // Use a prefix sum to determine the output locations of the masked elements - THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state); - THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, {}); - - THCThrustAllocator thrustAlloc(state); - thrust::device_ptr - maskData(THCudaLongTensor_data(state, maskLong)); - thrust::device_ptr - maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum)); - - thrust::exclusive_scan( -#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__ - thrust::cuda::par(thrustAlloc).on(c10::cuda::getCurrentCUDAStream()), -#endif - maskData, - maskData + THCudaLongTensor_nElement(state, maskLong), - maskPrefixSumData); - - // We are getting elements from `src` based on an offset from - // `maskPrefixSum`, so that should be made contiguous too - THCTensor* contigSrc = THCTensor_(newContiguous)(state, src); - - // update `tensor` where `mask` == 1 but pull from `src` at - // maskPrefixSum - bool status = THC_pointwiseApply3( - state, tensor, mask, maskPrefixSum, - TensorMaskedCopyOp( - THCTensor_(data)(state, contigSrc))); - - THCTensor_(free)(state, contigSrc); - THCudaLongTensor_free(state, maskLong); - THCudaLongTensor_free(state, maskPrefixSum); - - THArgCheck(status, 2, CUTORCH_DIM_WARNING); - THCudaCheck(cudaGetLastError()); -} - -void THCTensor_(maskedCopyByte)(THCState* state, - THCTensor *tensor, THByteTensor *mask, THCTensor *src) { - THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); - THCudaByteTensor* maskCuda = THTensor_wrap(mask).cuda().unsafeReleaseTensorImpl(); - THCTensor_(copy)(state, maskCuda, mask); - THCTensor_(maskedCopy)(state, tensor, maskCuda, src); - THCudaByteTensor_free(state, maskCuda); -} - #endif diff --git a/aten/src/THC/generic/THCTensorMasked.h b/aten/src/THC/generic/THCTensorMasked.h index 87e95a344973..16f4a262c8a0 100644 --- a/aten/src/THC/generic/THCTensorMasked.h +++ b/aten/src/THC/generic/THCTensorMasked.h @@ -21,23 +21,4 @@ TORCH_CUDA_CU_API void THCTensor_(maskedFillByte)( THByteTensor* mask, scalar_t value); -TORCH_CUDA_CU_API void THCTensor_(maskedCopy)( - THCState* state, - THCTensor* tensor, - THCudaByteTensor* mask, - THCTensor* src); - -TORCH_CUDA_CU_API void THCTensor_(maskedCopyBool)( - THCState* state, - THCTensor* tensor, - THCudaBoolTensor* mask, - THCTensor* src); - -// FIXME: remove now that we have THCudaByteTensor? 
-TORCH_CUDA_CU_API void THCTensor_(maskedCopyByte)( - THCState* state, - THCTensor* tensor, - THByteTensor* mask, - THCTensor* src); - #endif diff --git a/test/test_torch.py b/test/test_torch.py index f027db3b9ef6..424ab4f40c0f 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -1191,49 +1191,6 @@ def test_scatterReduce(self): for method in ["add", "multiply"]: self._test_scatter_base(self, lambda t: t, 'scatter_', reduction=method) - def test_masked_scatter(self): - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - for maskType in [torch.uint8, torch.bool]: - for dt in torch.testing.get_all_dtypes(): - num_copy, num_dest = 3, 10 - dest = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dt) - dest2 = dest.clone() - src = torch.tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=dt) - mask = torch.tensor((0, 0, 0, 0, 1, 0, 1, 0, 1, 0), dtype=maskType) - - if dt == torch.bool: - # torch.bool is a special case and is being tested - # in a separate test - continue - - # TODO: update test when masked scatter is supported for complex - if dt == torch.half or dt.is_complex: - self.assertRaises(RuntimeError, lambda: dest.masked_scatter_(mask, src)) - continue - - dest.masked_scatter_(mask, src) - j = 0 - for i in range(num_dest): - if mask[i]: - dest2[i] = src[j] - j += 1 - self.assertEqual(dest, dest2, atol=0, rtol=0) - - # make source bigger than number of 1s in mask - src = torch.tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=dt) - dest.masked_scatter_(mask, src) - - # make src smaller. this should fail - src = torch.zeros(num_copy - 1, dtype=dt) - with self.assertRaises(RuntimeError): - dest.masked_scatter_(mask, src) - self.assertEqual(len(w), 27) - - warn = 'masked_scatter_ received a mask with dtype torch.uint8,' - for wi in w: - self.assertEqual(str(wi.message)[0:55], str(warn)) - def test_masked_fill(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -4521,6 +4478,56 @@ def test_scatter_add_bool(self, device): [False, True, False, True, False], [True, False, True, False, True]], device=device)) + @onlyOnCPUAndCUDA + @dtypes(*torch.testing.get_all_dtypes()) + def test_masked_scatter(self, device, dtype): + dt = dtype + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + for maskType in [torch.uint8, torch.bool]: + num_copy, num_dest = 3, 10 + dest = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dt, device=device) + dest2 = dest.clone() + dest_ones = dest.clone() + dest_ones_expected = dest.clone() + src = torch.tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=dt, device=device) + src_ones = torch.tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=dt, device=device) + mask = torch.tensor((0, 0, 0, 0, 1, 0, 1, 0, 1, 0), dtype=maskType, device=device) + + if dt == torch.bool: + # torch.bool is a special case and is being tested + # in a separate test + return + + # TODO: update test when masked scatter is supported for complex + # and cpu supports half + if (dt == torch.half and self.device_type == 'cpu') or dt.is_complex: + self.assertRaises(RuntimeError, lambda: dest.masked_scatter_(mask, src)) + return + + dest.masked_scatter_(mask, src) + j = 0 + for i in range(num_dest): + if mask[i]: + dest2[i] = src[j] + dest_ones_expected[i] = src_ones[j] + j += 1 + self.assertEqual(dest, dest2, atol=0, rtol=0) + + dest_ones.masked_scatter_(mask, src_ones) + self.assertEqual(dest_ones, dest_ones_expected, atol=0, rtol=0) + + # make src smaller. 
this should fail + src = torch.zeros(num_copy - 1, dtype=dt, device=device) + with self.assertRaises(RuntimeError): + dest.masked_scatter_(mask, src) + + self.assertEqual(len(w), 3) + + warn = 'masked_scatter_ received a mask with dtype torch.uint8,' + for wi in w: + self.assertEqual(str(wi.message)[0:55], str(warn)) + def test_masked_scatter_bool_tensor(self, device): src = torch.tensor([True, True, True], device=device) dst = torch.tensor([False, False, False], device=device) From 3b6f30824cd131a63f6ac587b97106aa8df4f73b Mon Sep 17 00:00:00 2001 From: Lillian Johnson Date: Wed, 27 Jan 2021 15:01:46 -0800 Subject: [PATCH 16/41] OpInfo JIT op.output_func handling support (#50775) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50775 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D25964541 Pulled By: Lilyjjo fbshipit-source-id: 8cf1ee9191d526cc46ae283f38c2d64bd60afdb2 --- test/test_jit.py | 30 +++++++++---------- test/test_ops.py | 11 ++++--- torch/testing/_internal/common_jit.py | 10 +++---- .../_internal/common_methods_invocations.py | 6 +--- .../_internal/jit_metaprogramming_utils.py | 11 +++---- 5 files changed, 32 insertions(+), 36 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index e745f898824a..17f7f66ac8a5 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -15682,7 +15682,7 @@ def check(name): def fn(*inputs, **kwargs): attr = getattr(inputs[0], name) output = attr(*inputs[1:], **kwargs) - return output_process_fn(output) + return output check_types = test_name not in EXCLUDE_TYPE_CHECK # XXX: this test should always run with disable_autodiff_subgraph_inlining(True), @@ -15698,7 +15698,7 @@ def fn(*inputs, **kwargs): traced_fn = create_traced_fn(self, fn) check_against_reference(self, traced_fn, - fn, (self_variable,) + args_variable, kwargs_variable, + fn, output_process_fn, (self_variable,) + args_variable, kwargs_variable, check_types=check_types) if IS_SANDCASTLE: autodiff_nodes = autodiff_nodes + fusible_nodes @@ -15708,9 +15708,9 @@ def fn(*inputs, **kwargs): self.assertAutodiffNode(traced_fn.last_graph, should_autodiff_node, autodiff_nodes, fusible_nodes) if not is_magic_method and test_name not in EXCLUDE_SCRIPT: - script_fn = create_script_fn(self, name, 'method', output_process_fn) + script_fn = create_script_fn(self, name, 'method') check_against_reference(self, script_fn, - fn, (self_variable,) + args_variable, kwargs_variable, + fn, output_process_fn, (self_variable,) + args_variable, kwargs_variable, check_types=check_types) if IS_SANDCASTLE: @@ -15725,21 +15725,20 @@ def fn(*inputs, **kwargs): # functional interface tests if hasattr(torch, name) and name not in EXCLUDE_FUNCTIONAL: def fn(*inputs, **kwargs): - output = getattr(torch, name)(*inputs, **kwargs) - return output_process_fn(output) + return getattr(torch, name)(*inputs, **kwargs) f_args_variable = (self_variable,) + args_variable f_args_tensor = (self_tensor,) + args_tensor if not is_inplace and test_name not in EXCLUDE_TRACED: check_against_reference(self, - create_traced_fn(self, fn), - fn, f_args_variable, kwargs_variable, check_types=check_types) + create_traced_fn(self, fn), fn, output_process_fn, + f_args_variable, kwargs_variable, check_types=check_types) if not is_inplace and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, - create_script_fn(self, name, 'functional', output_process_fn), - fn, f_args_variable, kwargs_variable, + create_script_fn(self, name, 'functional'), + fn, output_process_fn, f_args_variable, 
kwargs_variable, check_types=check_types) # alias annotation testing @@ -15781,8 +15780,7 @@ def do_test(self, name=name, args=args, test_name=test_name, check_ad=check_ad): output_variable = getattr(F, name)(self_variable, *args_variable, **kwargs_variable) def fn(*inputs, **kwargs): - output = getattr(F, name)(*inputs, **kwargs) - return output_process_fn(output) + return getattr(F, name)(*inputs, **kwargs) f_args_variable = (self_variable,) + args_variable f_args_tensor = (self_tensor,) + args_tensor @@ -15793,8 +15791,9 @@ def run_test(): # XXX: this test should always run with disable_autodiff_subgraph_inlining(True), # so that we don't regress on autodiff support. with disable_autodiff_subgraph_inlining(): - script_fn = create_script_fn(self, name, 'nn_functional', output_process_fn) - check_against_reference(self, script_fn, fn, f_args_variable, kwargs_variable, no_grad=no_grad) + script_fn = create_script_fn(self, name, 'nn_functional') + check_against_reference(self, script_fn, fn, output_process_fn, + f_args_variable, kwargs_variable, no_grad=no_grad) # For tests we disabled AD subgraph inlining, make sure it's not falling back to autograd if (doAutodiffCheck(test_name)): self.assertAutodiffNode(script_fn.last_graph, should_autodiff_node, autodiff_nodes, fusible_nodes) @@ -15914,7 +15913,8 @@ def create_nn_module(*args, **kwargs): f_args_variable = deepcopy(unpack_variables(args_variable)) # Check against Python module as reference - check_against_reference(self, create_script_module, create_nn_module, f_args_variable, no_grad=no_grad) + check_against_reference(self, create_script_module, create_nn_module, + lambda x: x, f_args_variable, no_grad=no_grad) if 'slowTest' in kwargs: do_test = slowTest(do_test) diff --git a/test/test_ops.py b/test/test_ops.py index 40627cd4b264..a01f96fe877b 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -278,17 +278,15 @@ def test_variant_consistency_jit(self, device, dtype, op): # autodiff support. 
Context manager forces the graph to contain # DifferentiableGraph nodes if they are present with disable_autodiff_subgraph_inlining(): - def fn(*inputs, **kwargs): - output = func(*inputs, **kwargs) - return op.output_func(output) # Check scripted forward, grad, and grad grad - script_fn = create_script_fn(self, name, func_type, op.output_func) + script_fn = create_script_fn(self, name, func_type) check_against_reference(self, script_fn, - fn, + func, + op.output_func, (*sample.input,) + sample.args, sample.kwargs, no_grad=not test_backward) @@ -297,7 +295,8 @@ def fn(*inputs, **kwargs): traced_fn = create_traced_fn(self, variant) check_against_reference(self, traced_fn, - fn, + func, + op.output_func, (*sample.input,) + sample.args, sample.kwargs, no_grad=not test_backward) diff --git a/torch/testing/_internal/common_jit.py b/torch/testing/_internal/common_jit.py index 8c2b407beea1..a93e13b665be 100644 --- a/torch/testing/_internal/common_jit.py +++ b/torch/testing/_internal/common_jit.py @@ -36,7 +36,7 @@ def check_output_types(self, func, ref_outputs, args, kwargs): 'grid_sample', ]) -def check_against_reference(self, func, reference_func, args, kwargs=None, +def check_against_reference(self, func, reference_func, output_func, args, kwargs=None, allow_unused=True, check_types=True, no_grad=False): kwargs = kwargs if kwargs else {} @@ -72,10 +72,10 @@ def clone_inputs(requires_grad): with enable_profiling_mode_for_profiling_tests(): # test single grad case - outputs = self.runAndSaveRNG(reference_func, recording_inputs, kwargs) + outputs = output_func(self.runAndSaveRNG(reference_func, recording_inputs, kwargs)) grads = torch.autograd.grad(allSum(outputs), recording_tensors, allow_unused=allow_unused) - outputs_test = self.runAndSaveRNG(func, recording_inputs, kwargs) + outputs_test = output_func(self.runAndSaveRNG(func, recording_inputs, kwargs)) grads_test = torch.autograd.grad(allSum(outputs_test), recording_tensors, allow_unused=allow_unused) self.assertEqual(outputs, outputs_test) @@ -84,7 +84,7 @@ def clone_inputs(requires_grad): if self._testMethodName in nn_functional_single_grad: return - outputs = self.runAndSaveRNG(reference_func, recording_inputs, kwargs) + outputs = output_func(self.runAndSaveRNG(reference_func, recording_inputs, kwargs)) l1 = allSum(outputs) grads = torch.autograd.grad(l1, recording_tensors, create_graph=True, allow_unused=allow_unused) @@ -92,7 +92,7 @@ def clone_inputs(requires_grad): l2 = (allSum(grads) * l1) grads2 = torch.autograd.grad(l2, recording_tensors, allow_unused=allow_unused) recording_inputs, recording_tensors = clone_inputs(True) - outputs_test = self.runAndSaveRNG(func, recording_inputs, kwargs) + outputs_test = output_func(self.runAndSaveRNG(func, recording_inputs, kwargs)) l1_test = allSum(outputs_test) grads_test = torch.autograd.grad( l1_test, recording_tensors, create_graph=True, allow_unused=allow_unused) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 2ad65a9631dc..324a9346c45b 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1326,11 +1326,7 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): supports_tensor_out=False, sample_inputs_func=sample_inputs_slogdet, output_func=itemgetter(1), - decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack], - skips=( - # These tests do not work with output_func=itemgetter(1) - # TODO: remove this once 
https://github.com/pytorch/pytorch/issues/49326 is resolved - SkipInfo('TestCommon', 'test_variant_consistency_jit'),)), + decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack]), UnaryUfuncInfo('log', ref=np.log, domain=(0, float('inf')), diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py index cd134b38aba9..25ab7d1fc3f5 100644 --- a/torch/testing/_internal/jit_metaprogramming_utils.py +++ b/torch/testing/_internal/jit_metaprogramming_utils.py @@ -290,14 +290,15 @@ def gen_script_fn_and_args(method_name, func_type, *args, **kwargs): CU = torch.jit.CompilationUnit(script) return CU.the_method, tensors -# create a script function from (name, func_type, output_process_fn), -# returns a function takes in (args, kwargs) and runs the compiled function and -# then applies the post process fn to the outputs -def create_script_fn(self, method_name, func_type, output_process_fn): +# create a script function from (name, func_type), +# returns a function takes in (args, kwargs) and runs the compiled function +def create_script_fn(self, method_name, func_type): + # function returns tuple containing original output and + # filtered output to be used in checking gradients def script_fn(*args, **kwargs): fn, tensors = gen_script_fn_and_args(method_name, func_type, *args, **kwargs) self.assertExportImport(fn.graph, tensors) - output = output_process_fn(fn(*tensors)) + output = fn(*tensors) # skip type annotate function attributes for now, see: https://github.com/python/mypy/issues/2087 script_fn.last_graph = fn.graph_for(*tensors) # type: ignore[attr-defined] return output From 2de4ecd4ebc99d509b8f13ff12ed241c7433a0ad Mon Sep 17 00:00:00 2001 From: anjali411 Date: Wed, 27 Jan 2021 15:17:09 -0800 Subject: [PATCH 17/41] Add serialization logic for complex numbers (#50885) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50885 Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D26094906 Pulled By: anjali411 fbshipit-source-id: 7b2614f3ee4a30c4b4cf04aaa3432988b38a0721 --- test/test_complex.py | 12 ++++++++++++ torch/csrc/jit/python/pybind_utils.h | 2 ++ torch/csrc/jit/serialization/pickler.cpp | 10 ++++++++++ torch/csrc/jit/serialization/pickler.h | 1 + torch/csrc/jit/serialization/unpickler.cpp | 13 ++++++++++--- 5 files changed, 35 insertions(+), 3 deletions(-) diff --git a/test/test_complex.py b/test/test_complex.py index e6a705032d74..cc8ed7f62398 100644 --- a/test/test_complex.py +++ b/test/test_complex.py @@ -12,6 +12,18 @@ def fn(a: complex): self.checkScript(fn, (3 + 5j,)) + def test_pickle(self): + class ComplexModule(torch.jit.ScriptModule): + def __init__(self): + super().__init__() + self.a = 3 + 5j + + def forward(self, b: int): + return b + + loaded = self.getExportImportCopy(ComplexModule()) + self.assertEqual(loaded.a, 3 + 5j) + class TestComplexTensor(TestCase): @dtypes(*torch.testing.get_all_complex_dtypes()) def test_to_list(self, device, dtype): diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index 06c57f32f15c..eca56876999f 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -293,6 +293,8 @@ inline InferredType tryToInferType(py::handle input) { return InferredType(IntType::get()); } else if (py::isinstance(input)) { return InferredType(FloatType::get()); + } else if (PyComplex_CheckExact(input.ptr())) { + return InferredType(ComplexDoubleType::get()); } else if (py::isinstance(input)) { 
return InferredType(StringType::get()); } else if (THPLayout_Check(input.ptr())) { diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index e2118af2019d..a0dd826c4267 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -49,6 +49,8 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { pushTuple(ivalue); } else if (ivalue.isDouble()) { pushDouble(ivalue.toDouble()); + } else if (ivalue.isComplexDouble()) { + pushComplexDouble(ivalue); } else if (ivalue.isInt()) { pushInt(ivalue.toInt()); } else if (ivalue.isBool()) { @@ -464,6 +466,14 @@ void Pickler::pushDouble(double value) { // Python pickle format is big endian, swap. push(swapDouble(value)); } +void Pickler::pushComplexDouble(const IValue& value) { + c10::complex d = value.toComplexDouble(); + pushGlobal("builtins", "complex"); + pushIValue(d.real()); + pushIValue(d.imag()); + push(PickleOpCode::TUPLE2); + push(PickleOpCode::REDUCE); +} void Pickler::pushLong(const std::string& data) { uint64_t size = data.size(); diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index 21d0f61a18eb..4dc216ec702b 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -160,6 +160,7 @@ class TORCH_API Pickler { void endTypeTag(const IValue& value); void pushBool(bool value); void pushDouble(double value); + void pushComplexDouble(const IValue& value); void pushGenericList(const IValue& ivalue); void pushIntList(const IValue& ivalue); void pushList(const IValue& ivalue); diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index f363fe73f1e9..efeaac75c41c 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -57,6 +57,7 @@ void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) { case StorageType::Kind: case NumberType::Kind: case FloatType::Kind: + case ComplexDoubleType::Kind: case IntType::Kind: case NoneType::Kind: case GeneratorType::Kind: @@ -80,9 +81,6 @@ void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) { case AnyEnumType::Kind: // no op, there is nothing to tag break; - // TODO(@anjali411): Implement serialization/deserialization for complex - // numbers - case ComplexDoubleType::Kind: case EnumType::Kind: // TODO(gmagogsfm): Implement serialization/deserialization of Enum. 
AT_ASSERT(false); @@ -543,6 +541,15 @@ void Unpickler::readGlobal( // Unpickle a tensor bool quantized = class_name == "_rebuild_qtensor"; rebuildTensor(quantized); + } else if (module_name == "builtins" && class_name == "complex") { + globals_.emplace_back([this] { + auto elems = pop(stack_).toTuple()->elements(); + AT_ASSERT(elems.size() == 2); + auto complex = + c10::complex(elems.at(0).toDouble(), elems.at(1).toDouble()); + stack_.emplace_back(complex); + }); + } else if (module_name == "collections" && class_name == "OrderedDict") { // collections.OrderedDict is used in tensor serialization for a tensor's // backward hooks (but they are not actually saved with this Pickler) From 621198978a4073df213757cc29d305ee25405446 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 27 Jan 2021 15:41:41 -0800 Subject: [PATCH 18/41] Move USE_NUMPY to more appropriate targets (#51143) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51143 Test Plan: CI Reviewed By: wconstab Differential Revision: D26084123 fbshipit-source-id: af4abe4ef87c1ebe5434938320526a925f5c34c8 --- caffe2/CMakeLists.txt | 3 +++ caffe2/core/macros.h.in | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 62f9e8be3e4c..1b1656d1fffa 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1484,6 +1484,7 @@ if(BUILD_PYTHON) # ---[ Python. if(BUILD_CAFFE2) add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS}) + target_compile_options(caffe2_pybind11_state PRIVATE "-DUSE_NUMPY") if(NOT MSVC) set_target_properties(caffe2_pybind11_state PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") endif() @@ -1514,6 +1515,7 @@ if(BUILD_PYTHON) if(USE_CUDA) add_library(caffe2_pybind11_state_gpu MODULE ${Caffe2_GPU_PYTHON_SRCS}) + target_compile_options(caffe2_pybind11_state_gpu PRIVATE "-DUSE_NUMPY") if(NOT MSVC) set_target_properties(caffe2_pybind11_state_gpu PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") endif() @@ -1542,6 +1544,7 @@ if(BUILD_PYTHON) if(USE_ROCM) add_library(caffe2_pybind11_state_hip MODULE ${Caffe2_HIP_PYTHON_SRCS}) + target_compile_options(caffe2_pybind11_state_hip PRIVATE "-DUSE_NUMPY") if(NOT MSVC) target_compile_options(caffe2_pybind11_state_hip PRIVATE ${HIP_CXX_FLAGS} -fvisibility=hidden) endif() diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in index dd9f9902be1f..bd9a447b879d 100644 --- a/caffe2/core/macros.h.in +++ b/caffe2/core/macros.h.in @@ -44,10 +44,6 @@ static_assert( #cmakedefine CAFFE2_USE_NVTX #cmakedefine CAFFE2_USE_TRT -#ifndef USE_NUMPY -#cmakedefine USE_NUMPY -#endif - #ifndef EIGEN_MPL2_ONLY #cmakedefine EIGEN_MPL2_ONLY #endif From 98d9a6317d0818d70d154fe1d8fa7efc3c38c4ae Mon Sep 17 00:00:00 2001 From: yangu Date: Wed, 27 Jan 2021 15:46:46 -0800 Subject: [PATCH 19/41] Rename profile.next_step() to profile.step() to consistent with optimizer.step() (#51032) Summary: Similar with Optimizer.step(), profile.next_step() occurs every iteration and calls at the end of each iteration. So it's better to make them same naming style. 
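As a usage illustration of the renamed API, here is a minimal sketch modeled on the docstring updated below; the handler name, the toy matmul payload, and the specific schedule numbers are placeholders rather than part of this patch:

```
import torch

def trace_handler(prof):
    # runs whenever the schedule marks a cycle of profiling steps as ready
    print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))

with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
        on_trace_ready=trace_handler) as p:
    for _ in range(8):
        x = torch.randn(128, 128)
        (x @ x).sum()  # placeholder per-iteration workload
        p.step()       # formerly p.next_step()
```

The only change required of callers is the last line: the per-iteration signal is now `p.step()`.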
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51032 Reviewed By: heitorschueroff Differential Revision: D26097847 Pulled By: ilia-cher fbshipit-source-id: ea2e5c8e865d99f90b004ec7797271217efeeb68 --- test/test_profiler.py | 2 +- torch/profiler/profiler.py | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/test/test_profiler.py b/test/test_profiler.py index 826a9f5d0b57..9dfe099163a7 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -267,7 +267,7 @@ def trace_handler(p): ) as p: for idx in range(8): self.payload() - p.next_step() + p.step() self.assertEqual(called_num[0], 2) diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py index 25bee1c2019f..8024bb727a43 100644 --- a/torch/profiler/profiler.py +++ b/torch/profiler/profiler.py @@ -86,7 +86,7 @@ class profile(object): print(p.key_averages().table( sort_by="self_cuda_time_total", row_limit=-1)) - Usimg the profiler's ``schedule``, ``on_trace_ready`` and ``next_step`` functions: + Usimg the profiler's ``schedule``, ``on_trace_ready`` and ``step`` functions: .. code-block:: python @@ -96,7 +96,7 @@ class profile(object): def trace_handler(prof): print(prof.key_averages().table( sort_by="self_cuda_time_total", row_limit=-1)) - # prof.export_chrome_trace("/tmp/test_trace_" + str(prof.step()) + ".json") + # prof.export_chrome_trace("/tmp/test_trace_" + str(prof.step_num) + ".json") with torch.profiler.profile( activities=[ @@ -120,7 +120,7 @@ def trace_handler(prof): for iter in range(N): code_iteration_to_profile(iter) # send a signal to the profiler that the next iteration has started - p.next_step() + p.step() """ def __init__( self, @@ -172,7 +172,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.step_rec_fn.__exit__(None, None, None) self._exit_actions() - def next_step(self): + def step(self): """ Signals the profiler that the next profiling step has started. """ @@ -232,12 +232,6 @@ def next_step(self): self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num)) self.step_rec_fn.__enter__() - def step(self): - """ - Returns the current profiling step. - """ - return self.step_num - def export_chrome_trace(self, path: str): """ Exports the collected trace in Chrome JSON format. From 1321f2bfe6a100cb327e20a9d285bd51b650231a Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 27 Jan 2021 15:50:51 -0800 Subject: [PATCH 20/41] [PyTorch] Port Caffe2 opti for BatchMatMul batch size 1 to baddbmm (#51057) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51057 Caffe2 has an [optimization](https://github.com/pytorch/pytorch/blob/f8eefbdf7a229abbb864e47e0b664c7628d80224/caffe2/operators/batch_matmul_op.h#L192) for the case where the batch size is 1 that uses the underlying `gemm` instead of `gemm_batched` BLAS function. This diff tries to port that optimization to `baddbmm_mkl`. Note that I have very little linear algebra background and am just going off existing code and cblas API documentation, so please review without assuming I know what I'm doing with the math itself. 
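To make the intent concrete, here is a small Python sanity check (not the MKL C++ path itself) of the identity the batch-size-1 fast path relies on: baddbmm over a single batch reduces to one scaled matrix multiply, so one gemm call suffices instead of gemm_batched. The shapes and the alpha/beta values below are arbitrary:

```
import torch

M, K, N = 23, 8, 12
inp = torch.randn(1, M, N, dtype=torch.float64)
b1 = torch.randn(1, M, K, dtype=torch.float64)
b2 = torch.randn(1, K, N, dtype=torch.float64)
alpha, beta = 0.5, 2.0

batched = torch.baddbmm(inp, b1, b2, beta=beta, alpha=alpha)  # batch size 1
single = beta * inp[0] + alpha * (b1[0] @ b2[0])              # one plain gemm
assert torch.allclose(batched[0], single)
```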
ghstack-source-id: 120342923 Reviewed By: hlu1 Differential Revision: D26056613 fbshipit-source-id: feef80344b96601fc2bd0a2e8c8f6b57510d7856 --- aten/src/ATen/native/mkl/LinearAlgebra.cpp | 59 ++++++++++++++++++++-- test/test_linalg.py | 42 +++++++-------- 2 files changed, 78 insertions(+), 23 deletions(-) diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.cpp b/aten/src/ATen/native/mkl/LinearAlgebra.cpp index 0fc22c2c637d..cb14f9ae3333 100644 --- a/aten/src/ATen/native/mkl/LinearAlgebra.cpp +++ b/aten/src/ATen/native/mkl/LinearAlgebra.cpp @@ -32,6 +32,36 @@ Tensor& _baddbmm_mkl_(Tensor& self, const Tensor& batch1, const Tensor& batch2, namespace at { namespace native { +static inline void gemm(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int M, const int N, const int K, const float alpha, const float* A, + const int lda, const float* B, const int ldb, const float beta, float* C, const int ldc) { + cblas_sgemm(CblasRowMajor, trans_A, trans_B, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); +} + +static inline void gemm(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int M, const int N, const int K, const double alpha, const double* A, + const int lda, const double* B, const int ldb, const double beta, double* C, const int ldc) { + cblas_dgemm(CblasRowMajor, trans_A, trans_B, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); +} + +static inline void gemm(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int M, const int N, const int K, const c10::complex alpha, + const c10::complex* A, const int lda, const c10::complex* B, const int ldb, + const c10::complex beta, c10::complex* C, const int ldc) { + cblas_cgemm(CblasRowMajor, trans_A, trans_B, M, N, K, reinterpret_cast(&alpha), + reinterpret_cast(A), lda, reinterpret_cast(B), ldb, + reinterpret_cast(&beta), reinterpret_cast(C), ldc); +} + +static inline void gemm(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int M, const int N, const int K, const c10::complex alpha, + const c10::complex* A, const int lda, const c10::complex* B, const int ldb, + const c10::complex beta, c10::complex* C, const int ldc) { + cblas_zgemm(CblasRowMajor, trans_A, trans_B, M, N, K, reinterpret_cast(&alpha), + reinterpret_cast(A), lda, reinterpret_cast(B), ldb, + reinterpret_cast(&beta), reinterpret_cast(C), ldc); +} + static inline void gemm_batched(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, const int M, const int N, const int K, const float alpha, const float** A, const int lda, const float** B, const int ldb, const float beta, @@ -101,6 +131,31 @@ static inline void baddbmm_mkl_template(const Tensor& res, const Tensor& mat1, c const int ldb = trans_B == CblasTrans ? 
mat2_strides[2] : mat2_strides[1]; const int ldc = res.strides()[1]; + // avoid using tensor accessor in the case of mat1/mat2 not being transposed + // or only transposed in the last two axes + const bool canAvoidTensorAccessor = mat1_strides[0] == mat1_sizes[1] * mat1_sizes[2] && + mat2_strides[0] == mat2_sizes[1] * mat2_sizes[2]; + + scalar_t* const res_data = static_cast(res.data_ptr()); + + if (batch_size == 1) { + const scalar_t* A; + const scalar_t* B; + if (canAvoidTensorAccessor) { + scalar_t* mat1_data = static_cast(mat1.data_ptr()); + scalar_t* mat2_data = static_cast(mat2.data_ptr()); + A = mat1_data; + B = mat2_data; + } else { + auto mat1_acc = mat1.accessor(); + auto mat2_acc = mat2.accessor(); + A = mat1_acc[0].data(); + B = mat2_acc[0].data(); + } + gemm(trans_A, trans_B, M, N, K, alpha, A, lda, B, ldb, beta, res_data, ldc); + return; + } + std::vector A; A.reserve(batch_size); std::vector B; @@ -110,10 +165,8 @@ static inline void baddbmm_mkl_template(const Tensor& res, const Tensor& mat1, c // avoid using tensor accessor in the case of mat1/mat2 not being transposed // or only transposed in the last two axis - scalar_t* res_data = static_cast(res.data_ptr()); const auto res_sizes = res.sizes(); - if (mat1_strides[0] == mat1_sizes[1] * mat1_sizes[2] && - mat2_strides[0] == mat2_sizes[1] * mat2_sizes[2]) { + if (canAvoidTensorAccessor) { scalar_t* mat1_data = static_cast(mat1.data_ptr()); scalar_t* mat2_data = static_cast(mat2.data_ptr()); for (int64_t batch = 0; batch < batch_size; batch++) { diff --git a/test/test_linalg.py b/test/test_linalg.py index fd70ebaad04f..4f061dccc0de 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -4727,7 +4727,7 @@ def test_bmm(self, device, dtype): # undefined bahavior return - num_batches = 10 + batch_sizes = [1, 10] M, N, O = 23, 8, 12 numpy_dtype = dtype if dtype != torch.bfloat16 else torch.float32 @@ -4736,17 +4736,18 @@ def test_bmm(self, device, dtype): is_supported = TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) if not is_supported: - b1 = torch.randn(num_batches, M, N, device=device).to(dtype) - b2 = torch.randn(num_batches, N, O, device=device).to(dtype) - self.assertRaisesRegex(RuntimeError, "type|Type|not implemented|CUBLAS_STATUS_NOT_SUPPORTED", - lambda: torch.bmm(b1, b2)) + for num_batches in batch_sizes: + b1 = torch.randn(num_batches, M, N, device=device).to(dtype) + b2 = torch.randn(num_batches, N, O, device=device).to(dtype) + self.assertRaisesRegex(RuntimeError, "type|Type|not implemented|CUBLAS_STATUS_NOT_SUPPORTED", + lambda: torch.bmm(b1, b2)) return def invert_perm(p): d = {x: i for i, x in enumerate(p)} return (d[0], d[1], d[2]) - def generate_inputs(): + def generate_inputs(num_batches): # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) @@ -4769,21 +4770,22 @@ def generate_inputs(): b2 = torch.randn(shape2, dtype=dtype, device=device) yield b1, b2 - for (b1, b2), perm3 in itertools.product(generate_inputs(), itertools.permutations((0, 1, 2))): - res1 = torch.bmm(b1, b2) - res2 = torch.full((num_batches, M, O), math.nan, dtype=dtype, device=device) \ - .permute(perm3).contiguous().permute(invert_perm(perm3)) - torch.bmm(b1, b2, out=res2) - expect = torch.from_numpy( - b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) - self.assertEqual(expect, res1) - self.assertEqual(expect, res2) + for num_batches in batch_sizes: + for (b1, 
b2), perm3 in itertools.product(generate_inputs(num_batches), itertools.permutations((0, 1, 2))): + res1 = torch.bmm(b1, b2) + res2 = torch.full((num_batches, M, O), math.nan, dtype=dtype, device=device) \ + .permute(perm3).contiguous().permute(invert_perm(perm3)) + torch.bmm(b1, b2, out=res2) + expect = torch.from_numpy( + b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) + self.assertEqual(expect, res1) + self.assertEqual(expect, res2) - if self.device_type == 'cuda': - # check that mixed arguments are rejected - self.assertRaises(RuntimeError, lambda: torch.bmm(b1, b2.cpu())) - self.assertRaises(RuntimeError, lambda: torch.bmm(b1.cpu(), b2)) - self.assertRaises(RuntimeError, lambda: torch.bmm(b1, b2, out=res2.cpu())) + if self.device_type == 'cuda': + # check that mixed arguments are rejected + self.assertRaises(RuntimeError, lambda: torch.bmm(b1, b2.cpu())) + self.assertRaises(RuntimeError, lambda: torch.bmm(b1.cpu(), b2)) + self.assertRaises(RuntimeError, lambda: torch.bmm(b1, b2, out=res2.cpu())) @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "cublas runtime error") @onlyCUDA From 3f23ad5bce75539a23e8d462b4339e75a6171095 Mon Sep 17 00:00:00 2001 From: "Luca(Wei) Chen" Date: Wed, 27 Jan 2021 15:55:58 -0800 Subject: [PATCH 21/41] [Bug] fix for module_has_exports (#50680) Summary: The attributes in `dir(mod)` may not be valid, this will throw error when calling `getattr`. Use `hasattr` to test if it is valid. Here is an example: ```python class A: def __init__(self, x): if x: self._attr = 1 property def val(self): return getattr(self, '_attr') a = A(False) print('val' in dir(a)) print(hasattr(a, 'val')) b = A(True) print('val' in dir(b)) print(hasattr(b, 'val')) ``` And the outputs: ``` True False True True ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/50680 Reviewed By: malfet Differential Revision: D26103975 Pulled By: eellison fbshipit-source-id: 67a799afe7d726153c91654d483937c5e198ba94 --- test/jit/test_tracer.py | 18 ++++++++++++++++++ torch/_jit_internal.py | 9 +++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 9dff4b0f4549..71f572dedb54 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -1877,6 +1877,24 @@ def forward(self, inputs): tm = torch.jit.trace(m, torch.tensor(1.)) self.assertFalse(hasattr(tm, "submod")) + def test_trace_with_conditional_property(self): + class Net(nn.Module): + def __init__(self, attr=None): + super(Net, self).__init__() + if attr is not None: + self._attr = attr + self.attr_name = '_attr' + + @property + def attr(self): + return getattr(self, self.attr_name) + + def forward(self, x): + return x + + x = torch.ones(1) + torch.jit.trace(Net(), x) + class TestMixTracingScripting(JitTestCase): def test_trace_script(self): diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index f4dabcbf97de..ad7a2cf4ba88 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -600,10 +600,11 @@ def _copy_to_script_wrapper(fn): def module_has_exports(mod): for name in dir(mod): - item = getattr(mod, name) - if callable(item): - if get_torchscript_modifier(item) is FunctionModifiers.EXPORT: - return True + if hasattr(mod, name): + item = getattr(mod, name) + if callable(item): + if get_torchscript_modifier(item) is FunctionModifiers.EXPORT: + return True return False def should_drop(fn): From 42aeb68128e914676df1b82f162eeda56343ba6e Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: 
Wed, 27 Jan 2021 15:58:35 -0800 Subject: [PATCH 22/41] [TensorExpr] Move 'initializer' field from 'Tensor' to 'Buf'. (#50993) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50993 This is the first step to make 'Tensor` a thin wrapper over 'Buf' and 'Stmt', which will be finished in subsequent PRs. This change also allows to remove 'buf_initializers_' from 'LoopNest', making it "less stateful". Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D26038224 Pulled By: ZolotukhinM fbshipit-source-id: f418816e54c62f291fa45812901487394e9b95b5 --- test/cpp/tensorexpr/test_loopnest.cpp | 8 +++--- torch/csrc/jit/tensorexpr/expr.h | 20 ++++++++++--- torch/csrc/jit/tensorexpr/loopnest.cpp | 39 +++++++++----------------- torch/csrc/jit/tensorexpr/loopnest.h | 11 ++------ torch/csrc/jit/tensorexpr/tensor.h | 12 ++------ 5 files changed, 39 insertions(+), 51 deletions(-) diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index 89ad7eb2aecb..74f69079e2db 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -3566,7 +3566,7 @@ TEST(LoopNest, DeadStoreElimination) { Stmt* stmt = Block::make({stmt1}); // Will eliminate if not used by an output. - LoopNest loop(stmt, {f.node()}, {}, {}); + LoopNest loop(stmt, {f.node()}, {}); loop.eliminateDeadStores(); std::ostringstream oss; @@ -3580,7 +3580,7 @@ TEST(LoopNest, DeadStoreElimination) { torch::jit::testing::FileCheck().run(expected_ir, oss.str()); // But won't eliminate if used by different outputs. - LoopNest loop2(stmt, {f.node(), g.node()}, {}, {}); + LoopNest loop2(stmt, {f.node(), g.node()}, {}); loop2.eliminateDeadStores(); oss.clear(); @@ -3621,7 +3621,7 @@ TEST(LoopNest, DeadStoreEliminationWithIntermediates) { // Will eliminate the write to g, but not f since it used by the producer of // h. - LoopNest loop(stmt, {h.node()}, {}, {}); + LoopNest loop(stmt, {h.node()}, {}); loop.eliminateDeadStores(); std::ostringstream oss; @@ -3636,7 +3636,7 @@ TEST(LoopNest, DeadStoreEliminationWithIntermediates) { torch::jit::testing::FileCheck().run(expected_ir, oss.str()); // Sanity check won't eliminate if g is an output. 
- LoopNest loop2(stmt, {h.node(), g.node()}, {}, {}); + LoopNest loop2(stmt, {h.node(), g.node()}, {}); loop2.eliminateDeadStores(); oss.clear(); diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index 92aad34d3b7d..9645051e6a9a 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -183,11 +183,18 @@ class TORCH_API Buf : public ExprNode { Buf(const std::string& name_hint, const std::vector& dims, - Dtype dtype) - : Buf(new Var(name_hint, kHandle), dims, dtype) {} + Dtype dtype, + const Expr* initializer = nullptr) + : Buf(new Var(name_hint, kHandle), dims, dtype, initializer) {} - Buf(const Var* var, const std::vector& dims, Dtype dtype) - : ExprNodeBase(dtype, kPrimitive), base_handle_(var), dims_(dims) { + Buf(const Var* var, + const std::vector& dims, + Dtype dtype, + const Expr* initializer = nullptr) + : ExprNodeBase(dtype, kPrimitive), + base_handle_(var), + dims_(dims), + initializer_(initializer) { TORCH_CHECK(var); } @@ -207,9 +214,14 @@ class TORCH_API Buf : public ExprNode { dims_ = dims; }; + const Expr* initializer() const { + return initializer_; + }; + private: const Var* base_handle_; std::vector dims_; + const Expr* initializer_; }; class TORCH_API BufHandle : public ExprHandle { diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 71869d1d33d7..b189f04d55b2 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -544,10 +544,7 @@ Stmt* LoopNest::lowerToStmt(Tensor* t) { return body; } - const Expr* initializer = t->initializer(); - if (initializer) { - buf_initializers_[t->buf()] = initializer; - } + const Expr* init_expr = t->buf()->initializer(); std::vector indices(t->args().begin(), t->args().end()); @@ -561,8 +558,8 @@ Stmt* LoopNest::lowerToStmt(Tensor* t) { t->reduce_dim(dim_index), body); } - if (initializer) { - Store* init = new Store(t->buf(), indices, initializer, new IntImm(1)); + if (init_expr) { + Store* init = new Store(t->buf(), indices, init_expr, new IntImm(1)); body = new Block({init, body}); } } @@ -2352,8 +2349,9 @@ void LoopNest::rfactor( } std::vector new_dims = {}; - Buf* tmp_buf = - new Buf(new Var("tmp_buf", kHandle), new_dims, reduce_op->dtype()); + const Expr* init = reduce_op->accumulator()->initializer(); + TORCH_INTERNAL_ASSERT(init); + Buf* tmp_buf = new Buf("tmp_buf", new_dims, reduce_op->dtype(), init); auto old_acc = reduce_op->accumulator(); auto new_inner = reduce_op->reduce_args(); @@ -2425,26 +2423,17 @@ void LoopNest::rfactor( throw std::runtime_error("TODO: enable non-root insertion points"); } - // From this point forward any errors cannot be handled silently. - auto init_it = buf_initializers_.find(reduce_op->accumulator()); - if (init_it != buf_initializers_.end()) { - buf_initializers_[tmp_buf] = init_it->second; - Stmt* init_stmt = - new Store(tmp_buf, new_outer, init_it->second, new IntImm(1)); + Stmt* init_stmt = new Store(tmp_buf, new_outer, init, new IntImm(1)); - // Wrap it in any loops lower than the insertion point of the new reduction. - for (auto* ol : output_loops) { - init_stmt = ol->cloneWithNewBody(init_stmt); - } + // Wrap it in any loops lower than the insertion point of the new reduction. 
+ for (auto* ol : output_loops) { + init_stmt = ol->cloneWithNewBody(init_stmt); + } - if (output_contains_target) { - parent_block->insert_stmt_before(init_stmt, new_root_for); - } else { - new_root_for->body()->prepend_stmt(init_stmt); - } + if (output_contains_target) { + parent_block->insert_stmt_before(init_stmt, new_root_for); } else { - // We may support this but not possible now. - throw std::runtime_error("can't rfactor reduction with no initializer\n"); + new_root_for->body()->prepend_stmt(init_stmt); } auto second_buf = dynamic_cast(second_reduce->accumulator()); diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index 3f37468c0a80..6c016f9f55ed 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -28,17 +28,14 @@ class TORCH_API LoopNest { LoopNest(const std::vector& output_tensors); // A constructor for building a LoopNest from a pre-baked Stmt and meta-info - // TODO: Nuke intermediate_bufs_ and possibly buf_initializers from here if - // they can be deduced. + // TODO: Nuke intermediate_bufs_ from here if they can be deduced. LoopNest( Stmt* stmt, const std::unordered_set& output_bufs, - const std::unordered_set& intermediate_bufs, - const std::unordered_map& buf_initializers) + const std::unordered_set& intermediate_bufs) : root_stmt_(stmt), output_bufs_(output_bufs), - intermediate_bufs_(intermediate_bufs), - buf_initializers_(buf_initializers) {} + intermediate_bufs_(intermediate_bufs) {} Stmt* root_stmt() const { return root_stmt_; @@ -133,8 +130,6 @@ class TORCH_API LoopNest { std::unordered_set input_bufs_; std::unordered_set output_bufs_; std::unordered_set intermediate_bufs_; - // Holds the initializer Expr of buffers that have been initialized. - std::unordered_map buf_initializers_; }; TORCH_API Stmt* FlattenIndexes(Stmt* s); diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h index e5e399db348b..53e0faa9eb16 100644 --- a/torch/csrc/jit/tensorexpr/tensor.h +++ b/torch/csrc/jit/tensorexpr/tensor.h @@ -90,12 +90,6 @@ class TORCH_API Tensor : KernelScopedObject { return reduce_args_[index]; } - void initializeTo(const Expr* initializer) { - initializer_ = initializer; - } - const Expr* initializer() const { - return initializer_; - } virtual Stmt* ElementStmt() const; template @@ -111,8 +105,6 @@ class TORCH_API Tensor : KernelScopedObject { const Expr* body_; std::vector reduce_dims_; std::vector reduce_args_; - - const Expr* initializer_{nullptr}; }; class TORCH_API CompoundTensor : public Tensor { @@ -268,12 +260,12 @@ Tensor* Reduce( ExprHandle body = Reducer::getReduceBody(body_func, VarVectorToVarHandleVector(all_vars)); std::vector output_args(vars.begin(), vars.end()); - Buf* func_result = new Buf(func_name, dims, body.dtype()); + const Expr* init_expr = new Cast(body.dtype(), reducer.initializer()); + Buf* func_result = new Buf(func_name, dims, body.dtype(), init_expr); const ReduceOp* reduce_op = reducer(func_result, body, output_args, reduce_vars); Tensor* t = new Tensor(func_result, vars, reduce_dims, reduce_vars, reduce_op); - t->initializeTo(new Cast(body.dtype(), reducer.initializer())); return t; } From b804084428506675867d7b20eeee0cdee490a409 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Wed, 27 Jan 2021 15:58:35 -0800 Subject: [PATCH 23/41] [TensorExpr] Move 'lowerToStmt' method from 'LoopNest' to 'Tensor'. 
(#50994) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50994 Eventually, 'Tensor' will be fully responsible for its 'Stmt' and moving this method to it is one step in that direction. Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D26038222 Pulled By: ZolotukhinM fbshipit-source-id: 0549f0ae6b46a93ff7608a22e79faa5115eef661 --- torch/csrc/jit/tensorexpr/loopnest.cpp | 42 +------------------------- torch/csrc/jit/tensorexpr/loopnest.h | 1 - torch/csrc/jit/tensorexpr/tensor.cpp | 37 +++++++++++++++++++++++ torch/csrc/jit/tensorexpr/tensor.h | 2 ++ 4 files changed, 40 insertions(+), 42 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index b189f04d55b2..6e63743496cf 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -508,7 +508,7 @@ LoopNest::LoopNest(const std::vector& output_tensors) { std::vector loops; for (Tensor* t : tensors_to_compute) { - Stmt* loop = lowerToStmt(t); + Stmt* loop = t->lowerToStmt(); // Flatten initializers. if (Block* block = dynamic_cast(loop)) { for (auto* s : block->stmts()) { @@ -532,46 +532,6 @@ LoopNest::LoopNest(const std::vector& output_tensors) { } } -Stmt* LoopNest::lowerToStmt(Tensor* t) { - Stmt* body = t->ElementStmt(); - - // If this Tensor has no functional body, it already has its axes expanded. - if (nullptr == t->body()) { - return body; - } - - if (t->ndim() == 0 && t->reduce_ndim() == 0) { - return body; - } - - const Expr* init_expr = t->buf()->initializer(); - - std::vector indices(t->args().begin(), t->args().end()); - - if (t->reduce_ndim() > 0) { - for (size_t i = 0; i < t->reduce_ndim(); i++) { - // Going in reverse order: from innermost loop to the outermost - size_t dim_index = t->reduce_ndim() - i - 1; - body = new For( - t->reduce_arg(dim_index), - new IntImm(0), - t->reduce_dim(dim_index), - body); - } - if (init_expr) { - Store* init = new Store(t->buf(), indices, init_expr, new IntImm(1)); - body = new Block({init, body}); - } - } - - for (size_t i = 0; i < t->ndim(); i++) { - // Going in reverse order: from innermost loop to the outermost - size_t dim_index = t->ndim() - i - 1; - body = new For(t->arg(dim_index), new IntImm(0), t->dim(dim_index), body); - } - return body; -} - class FunctionInliner : public IRMutator { public: FunctionInliner(Store* producer, std::unordered_set outputs) diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index 6c016f9f55ed..1bff2e96d415 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -122,7 +122,6 @@ class TORCH_API LoopNest { private: std::vector findAllNeededTensors( const std::vector& tensors); - Stmt* lowerToStmt(Tensor* t); Stmt* insertAllocFree(Stmt* stmt); Stmt* root_stmt_; diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp index d12f6999c8d5..e32f9f293940 100644 --- a/torch/csrc/jit/tensorexpr/tensor.cpp +++ b/torch/csrc/jit/tensorexpr/tensor.cpp @@ -8,6 +8,43 @@ namespace torch { namespace jit { namespace tensorexpr { +Stmt* Tensor::lowerToStmt() const { + Stmt* s = ElementStmt(); + + // If this Tensor has no functional body, it already has its axes expanded. 
+ if (nullptr == body()) { + return s; + } + + if (ndim() == 0 && reduce_ndim() == 0) { + return s; + } + + const Expr* init_expr = buf()->initializer(); + + std::vector indices(args().begin(), args().end()); + + if (reduce_ndim() > 0) { + for (size_t i = 0; i < reduce_ndim(); i++) { + // Going in reverse order: from innermost loop to the outermost + size_t dim_index = reduce_ndim() - i - 1; + s = new For( + reduce_arg(dim_index), new IntImm(0), reduce_dim(dim_index), s); + } + if (init_expr) { + Store* init = new Store(buf(), indices, init_expr, new IntImm(1)); + s = new Block({init, s}); + } + } + + for (size_t i = 0; i < ndim(); i++) { + // Going in reverse order: from innermost loop to the outermost + size_t dim_index = ndim() - i - 1; + s = new For(arg(dim_index), new IntImm(0), dim(dim_index), s); + } + return s; +} + Tensor* Compute( const std::string& func_name, const std::vector& dim_args, diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h index 53e0faa9eb16..31454f745944 100644 --- a/torch/csrc/jit/tensorexpr/tensor.h +++ b/torch/csrc/jit/tensorexpr/tensor.h @@ -99,6 +99,8 @@ class TORCH_API Tensor : KernelScopedObject { template inline ExprHandle call(const Ts&... ts); + Stmt* lowerToStmt() const; + private: const Buf* buf_; std::vector args_; From e9751694266130eb9dc7be64fc0b99e4ad6fcb6c Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Wed, 27 Jan 2021 15:58:35 -0800 Subject: [PATCH 24/41] [TensorExpr] Redesign `Tensor` class. (#50995) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50995 This change makes 'Tensor' a thin wrapper over 'Buf' and 'Stmt', and merges it with recently introduced 'CompoundTensor'. A statement for the tensor is either passed directly to the Tensor constructor (akin to 'CompoundTensor'), or is built immediately in constructor. LoopNest is no longer responsible for constructing statements from tensors - it simply stitches already constructed statements contained in Tensors. This has a side effect that now we cannot construct several loopnests from the same tensors - we need to explicitly clone statements if we want to do that. A special copy constructor was added to LoopNest to make it more convenient (note: this only affects tests, we don't usually create multiple loopnests in other places). 
Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D26038223 Pulled By: ZolotukhinM fbshipit-source-id: 27a2e5900437cfb0c151e8f89815edec53608e17 --- benchmarks/cpp/tensorexpr/bench_reduce.cpp | 7 +- test/cpp/tensorexpr/test_llvm.cpp | 28 +++--- test/cpp/tensorexpr/test_loopnest.cpp | 29 ++---- test/cpp/tensorexpr/test_reductions.cpp | 50 +++++----- test/cpp/tensorexpr/tutorial.cpp | 77 ++++++++++++---- torch/csrc/jit/tensorexpr/ir_printer.cpp | 14 +-- torch/csrc/jit/tensorexpr/kernel.cpp | 6 +- torch/csrc/jit/tensorexpr/loopnest.cpp | 24 +++-- torch/csrc/jit/tensorexpr/loopnest.h | 2 + torch/csrc/jit/tensorexpr/tensor.cpp | 95 +++++++++---------- torch/csrc/jit/tensorexpr/tensor.h | 102 ++++----------------- 11 files changed, 209 insertions(+), 225 deletions(-) diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp index cd467d74162e..06bc9b055176 100644 --- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp +++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -365,7 +366,8 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { te::For* mi = loops[1]; // TODO: rfactor works on the untransformed var set. This is a problem since we need to // look for the loop after Split to rfactor. - loop.rfactor(BT->body(), mi->var()); + auto bt_body = te::NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(bt_body, mi->var()); } loop.prepareForCodegen(); @@ -411,7 +413,8 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV2)(benchmark::State& state) { TORCH_CHECK(loops.size() == 2); te::For* mo = loops[0]; te::For* mi = loops[1]; - loop.rfactor(BT->body(), mi->var()); + auto bt_body = te::NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(bt_body, mi->var()); } { diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index 7afb839dc7e0..343c965c294c 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -1,17 +1,17 @@ #ifdef TORCH_ENABLE_LLVM #include -#include "test/cpp/tensorexpr/test_base.h" - -#include "test/cpp/tensorexpr/padded_buffer.h" -#include "test/cpp/tensorexpr/test_utils.h" -#include "torch/csrc/jit/tensorexpr/eval.h" -#include "torch/csrc/jit/tensorexpr/ir.h" -#include "torch/csrc/jit/tensorexpr/ir_printer.h" -#include "torch/csrc/jit/tensorexpr/ir_simplifier.h" -#include "torch/csrc/jit/tensorexpr/llvm_codegen.h" -#include "torch/csrc/jit/tensorexpr/loopnest.h" -#include "torch/csrc/jit/tensorexpr/tensor.h" +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -1481,7 +1481,8 @@ TEST(LLVM, RFactorReduction) { loops = loop.getLoopStmtsFor(b); loop_m = loops.at(2); loop_n = loops.at(1); - loop.rfactor(b->body(), loop_n->var(), loop_n->body()); + auto b_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(b_body, loop_n->var(), loop_n->body()); loop.prepareForCodegen(); Stmt* s = loop.root_stmt(); @@ -1522,7 +1523,8 @@ TEST(LLVM, RFactorVectorizedReduction) { For* loop_k = loops.at(0); For* loop_m = loops.at(1); For* loop_n = loops.at(2); - loopnest.rfactor(b->body(), loop_n->var()); + auto b_body = NodeFinder::find(loopnest.root_stmt())[0]; + loopnest.rfactor(b_body, loop_n->var()); loops = NodeFinder::find(loopnest.root_stmt()); loop_k = loops.at(0); diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index 74f69079e2db..6f44b5e2b033 100644 --- 
a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -826,7 +826,7 @@ TEST(LoopNest, ScheduleInlineSimple) { }); LoopNest l1({y}); - LoopNest l2({y}); + LoopNest l2(l1); l2.computeInline(x->buf()); l1.prepareForCodegen(); @@ -1156,7 +1156,7 @@ TEST(LoopNest, ScheduleInlineIntrinsics) { } LoopNest l1({y}); - LoopNest l2({y}); + LoopNest l2(l1); l2.computeInline(x->buf()); l1.prepareForCodegen(); @@ -1228,7 +1228,6 @@ TEST(LoopNest, ScheduleSplitAThenInline) { return a->call(j + ExprHandle(8)); }); - LoopNest loop({b}); For* i_outer; For* i_inner; @@ -1247,7 +1246,6 @@ TEST(LoopNest, ScheduleSplitBThenInline) { return a->call(j + ExprHandle(8)); }); - LoopNest loop({b}); For* i_outer; For* i_inner; @@ -1275,8 +1273,6 @@ TEST(LoopNest, ScheduleSplitTwiceThenInline) { Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { return a->call(j + ExprHandle(8)); }); - - LoopNest loop({b}); For* i_outer; For* i_inner; @@ -1296,7 +1292,6 @@ TEST(LoopNest, ScheduleInlineThenSplit) { return a->call(j + ExprHandle(8)); }); - LoopNest loop({b}); For* i_outer; For* i_inner; @@ -1325,7 +1320,6 @@ TEST(LoopNest, ScheduleSplitInlineThenSplit) { return a->call(j + ExprHandle(8)); }); - LoopNest loop({b}); For* i_outer; For* i_inner; @@ -1357,7 +1351,6 @@ TEST(LoopNest, ScheduleSplitInlineSimplify) { return a->call(j) - ExprHandle(1); }); - LoopNest loop({b}); For* i_outer; For* i_inner; @@ -1714,10 +1707,11 @@ TEST(LoopNest, LoopNestComputeAt_2) { c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1); } } + LoopNest orig_loopnest({c}); { // First let's try to compute P at axis cy (the outer loop) - LoopNest l({c}); + LoopNest l(orig_loopnest); std::vector loops = l.getLoopStmtsFor(c); l.computeAt(l.getLoopBodyFor(p), loops[0]); l.prepareForCodegen(); @@ -1748,7 +1742,7 @@ TEST(LoopNest, LoopNestComputeAt_2) { } { // Now let's try to compute P at axis cx (the inner loop) - LoopNest l({c}); + LoopNest l(orig_loopnest); std::vector loops = l.getLoopStmtsFor(c); l.computeAt(l.getLoopBodyFor(p), loops[1]); l.prepareForCodegen(); @@ -1823,9 +1817,10 @@ TEST(LoopNest, LoopNestComputeAt_3) { } } + LoopNest orig_loopnest({D}); { // First let's try to compute A at axis dy (the outer loop) - LoopNest l({D}); + LoopNest l(orig_loopnest); std::vector loops = l.getLoopStmtsFor(D); l.computeAt(l.getLoopBodyFor(A), loops[0]); l.prepareForCodegen(); @@ -1861,7 +1856,7 @@ TEST(LoopNest, LoopNestComputeAt_3) { } { // Now let's try to compute A at axis dx (the inner loop) - LoopNest l({D}); + LoopNest l(orig_loopnest); std::vector loops = l.getLoopStmtsFor(D); l.computeAt(l.getLoopBodyFor(A), loops[1]); l.prepareForCodegen(); @@ -1897,10 +1892,6 @@ TEST(LoopNest, LoopNestComputeAt_3) { } } -TEST(LoopNest, LoopNestComputeAt_4) { - // TODO: Verify that computeAt works with reduction axis -} - class LoopOrderHelper : public IRVisitor { std::stringstream ordering; @@ -3668,7 +3659,7 @@ TEST(LoopNest, CompoundTensorSimple) { auto outer_for2 = For::make(x, 0, 10, inner_for2); Block* body = Block::make({outer_for1, outer_for2}); - Tensor* A = new CompoundTensor(a_buf.node(), {i.node(), j.node()}, body); + Tensor* A = new Tensor(a_buf.node(), body); LoopNest l({A}); l.prepareForCodegen(); @@ -3707,7 +3698,7 @@ TEST(LoopNest, CompoundTensorUsed) { auto outer_for2 = For::make(x, 0, 10, inner_for2); Block* body = Block::make({outer_for1, outer_for2}); - Tensor* A = new CompoundTensor(a_buf.node(), {i.node(), j.node()}, body); + Tensor* A = new Tensor(a_buf.node(), body); Tensor* 
B = Compute( "B", {{10, "i"}, {3, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return A->call(i, j + 1) + A->call(i, j + 2); diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index f69217df9bde..9c538741d9f4 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -259,9 +259,9 @@ TEST(Reductions, ReduceMax) { Tensor* m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}}); - loop = LoopNest({m2d}); - loop.prepareForCodegen(); - s = loop.root_stmt(); + LoopNest loop2({m2d}); + loop2.prepareForCodegen(); + s = loop2.root_stmt(); s = IRSimplifier::simplify(s); SimpleIREvaluator cg2(s, {in2_, m2d}); @@ -372,9 +372,9 @@ TEST(Reductions, ReduceAnyAll) { }, {{10, "j"}}); - loop = LoopNest({allGreaterThan}); - loop.prepareForCodegen(); - s = loop.root_stmt(); + LoopNest loop2({allGreaterThan}); + loop2.prepareForCodegen(); + s = loop2.root_stmt(); s = IRSimplifier::simplify(s); SimpleIREvaluator cg2(s, {b, allGreaterThan, searchValue}); @@ -699,7 +699,8 @@ TEST(Reductions, ReduceRfactor) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(1)->var(); - loop.rfactor(c->body(), v); + auto c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -734,7 +735,8 @@ TEST(Reductions, Reduce3DRfactorInternal) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(1)->var(); - loop.rfactor(c->body(), v); + auto c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -769,7 +771,8 @@ TEST(Reductions, Reduce3DRfactorInner) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(2)->var(); - loop.rfactor(c->body(), v); + auto c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -804,7 +807,8 @@ TEST(Reductions, Reduce3DRfactorOuter) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(0)->var(); - loop.rfactor(c->body(), v); + auto c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -841,7 +845,8 @@ TEST(Reductions, Reduce3DRfactorWithOuter) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(3)->var(); - loop.rfactor(c->body(), v); + auto c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -870,12 +875,13 @@ TEST(Reductions, Reduce3DRfactorRepeated) { } Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); + LoopNest orig_loopnest({c}); for (int rVar1 = 0; rVar1 < 3; ++rVar1) { for (int rVar2 = 0; rVar2 < 2; ++rVar2) { std::vector out(1, -1.f); - LoopNest loop({c}); + LoopNest loop(orig_loopnest); auto reduces = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(reduces.size(), 1); auto v1 = reduces[0]->reduce_args()[rVar1]; @@ -921,7 +927,8 @@ TEST(Reductions, ReduceRfactorInsertionPoint) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(0)->var(); - loop.rfactor(c->body(), v, loops.at(0)->body()); + auto 
c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v, loops.at(0)->body()); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -956,7 +963,8 @@ TEST(Reductions, Reduce3DRfactorInsertionPoint) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(1)->var(); - loop.rfactor(c->body(), v, loops.at(1)->body()); + auto c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v, loops.at(1)->body()); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -985,13 +993,12 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) { in_, {{2, "a"}, {3, "b"}, {4, "c"}, {5, "d"}, {6, "e"}}); LoopNest refloop({c}); + LoopNest loop(refloop); refloop.prepareForCodegen(); SimpleIREvaluator ref_cg( IRSimplifier::simplify(refloop.root_stmt()), {in_, c}); ref_cg.call({in, ref}); - LoopNest loop({c}); - // rfactor out "c". auto reduces = NodeFinder::find(loop.root_stmt()); loop.rfactor(reduces[0], reduces[0]->reduce_args()[3]); @@ -1373,7 +1380,7 @@ TEST(Reductions, ReduceInlineConsumer) { } LoopNest l1({y}); - LoopNest l2({y}); + LoopNest l2(l1); l2.computeInline(x->buf()); l1.prepareForCodegen(); @@ -1431,7 +1438,7 @@ TEST(Reductions, ReduceInlineReducerInternal) { } LoopNest l1({y}); - LoopNest l2({y}); + LoopNest l2(l1); l2.computeInline(x->buf()); l1.prepareForCodegen(); @@ -1863,11 +1870,11 @@ TEST(Reductions, ReductionVectorize) { Tensor* tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); LoopNest l_before({tensor}); + LoopNest l(l_before); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {in, tensor}); cg_before.call({in_, out_before}); - LoopNest l({tensor}); l.vectorize(l.getLoopStmtsFor(tensor)[0]); Stmt* s = l.root_stmt(); @@ -1923,11 +1930,11 @@ TEST(Reductions, ReductionVectorizeRfactor) { Tensor* tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}}); LoopNest l_before({tensor}); + LoopNest l(l_before); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {in, tensor}); cg_before.call({in_, out_before}); - LoopNest l({tensor}); ASSERT_THROWS_WITH( l.vectorize(l.getLoopStmtsFor(tensor)[1]), "reduction axis"); @@ -1935,7 +1942,8 @@ TEST(Reductions, ReductionVectorizeRfactor) { // loop. 
std::vector loops = l.getLoopStmtsFor(tensor); auto v = loops.at(1)->var(); - l.rfactor(tensor->body(), v); + auto tensor_body = NodeFinder::find(l.root_stmt())[0]; + l.rfactor(tensor_body, v); loops = NodeFinder::find(l.root_stmt()); l.vectorize(loops[2]); diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp index 31e05549186e..b935e5d2b6b6 100644 --- a/test/cpp/tensorexpr/tutorial.cpp +++ b/test/cpp/tensorexpr/tutorial.cpp @@ -118,33 +118,53 @@ int main(int argc, char* argv[]) { std::cout << "*** Tensors, Functions, and Placeholders ***" << std::endl; { - // A tensor computation is represented by objects of Tensor class and + // A tensor computation is represented by Tensor class objects and // consists of the following pieces: // - domain, which is specified by a Buf expression - // - an expression (or several expressions if we want to perform several - // independent computations over the same domain) for its elements, as a - // function of indices - // - // TODO: Update this section once Tensor/Function cleanup is done + // - a tensor statement, specified by a Stmt object, that computation to + // be performed in this domain + + // Let's start with defining a domain. We do this by creating a Buf object. + + // First, let's specify the sizes: std::vector dims = { new IntImm(64), new IntImm(32)}; // IntImm stands for Integer Immediate // and represents an integer constant - // Next we need to create arguments. The arguments are Vars, and they play - // role of placeholders. The computation that the tensor would describe - // would use these arguments. + // Now we can create a Buf object by providing a name, dimensions, and a + // data type of the elements: + const Buf* buf = new Buf("X", dims, kInt); + + // Next we need to spefify the computation. We can do that by either + // constructing a complete tensor statement for it (statements are + // examined in details in subsequent section), or by using a convenience + // method where we could specify axis and an element expression for the + // computation. In the latter case a corresponding statement would be + // constructed automatically. + + // Let's define two variables, i and j - they will be axis in our + // computation. const Var* i = new Var("i", kInt); const Var* j = new Var("j", kInt); std::vector args = {i, j}; // Now we can define the body of the tensor computation using these - // arguments. + // variables. What this means is that values in our tensor are: + // X[i, j] = i * j Expr* body = new Mul(i, j); // Finally, we pass all these pieces together to Tensor constructor: - Tensor* X = new Tensor("X", dims, args, body); + Tensor* X = new Tensor(buf, args, body); std::cout << "Tensor computation: " << *X << std::endl; - // Prints: Tensor computation: Tensor X(i[64], j[32]) = i * j + // Prints: + // Tensor computation: Tensor X[64, 32]: + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // X[i, j] = i * j; + // } + // } + + // TODO: Add an example of constructing a Tensor with a complete Stmt. 
// Similarly to how we provide a more convenient way of using handles for // constructing Exprs, Tensors also have a more convenient API for @@ -155,11 +175,17 @@ int main(int argc, char* argv[]) { {{64, "i"}, {32, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i / j; }); std::cout << "Tensor computation: " << *Z << std::endl; - // Prints: Tensor computation: Tensor Z(i[64], j[32]) = i / j + // Prints: + // Tensor computation: Tensor Z[64, 32]: + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // Z[i, j] = i / j; + // } + // } // Tensors might access other tensors and external placeholders in their // expressions. It can be done like so: - Placeholder P("P", kFloat, {64, 32}); + Placeholder P("P", kInt, {64, 32}); Tensor* R = Compute( "R", {{64, "i"}, {32, "j"}}, @@ -167,7 +193,13 @@ int main(int argc, char* argv[]) { return Z->call(i, j) * P.load(i, j); }); std::cout << "Tensor computation: " << *R << std::endl; - // Prints: Tensor computation: Tensor R(i[64], j[32]) = Z(i, j) * P[i, j] + // Prints: + // Tensor computation: Tensor R[64, 32]: + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // R[i, j] = (Z(i, j)) * (P[i, j]); + // } + // } // Placeholders could be thought of as external tensors, i.e. tensors for // which we don't have the element expression. In other words, for `Tensor` @@ -211,8 +243,19 @@ int main(int argc, char* argv[]) { std::cout << "Tensor computation X: " << *X << "Tensor computation Y: " << *Y << std::endl; // Prints: - // Tensor computation X: Tensor X(i[64], j[32]) = (A[i, j]) + (B[i, j]) - // Tensor computation Y: Tensor Y(i[64], j[32]) = sigmoid(X(i, j)) + // Tensor computation X: Tensor X[64, 32]: + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // X[i, j] = (A[i, j]) + (B[i, j]); + // } + // } + + // Tensor computation Y: Tensor Y[64, 32]: + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // Y[i, j] = sigmoid(X(i, j)); + // } + // } // Creating a loop nest is as quite simple, we just need to specify what are // the output tensors in our computation and LoopNest object will diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index 1df2f96671df..96fb11d3a982 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -596,19 +596,15 @@ std::string to_string(const Tensor* t) { return "(null tensor)\n"; } std::ostringstream oss; - if (!t->body()) { - oss << "Tensor " << t->buf()->name_hint() << " = " << *t->ElementStmt() - << "\n"; - return oss.str(); - } - oss << "Tensor " << t->buf()->name_hint() << "("; - for (size_t i = 0; i < t->ndim(); i++) { + // TODO: move this to Buf printer + oss << "Tensor " << t->buf()->name_hint() << "["; + for (size_t i = 0; i < t->buf()->ndim(); i++) { if (i != 0) { oss << ", "; } - oss << *t->arg(i) << "[" << *t->dim(i) << "]"; + oss << *t->buf()->dim(i); } - oss << ") = " << *t->body() << "\n"; + oss << "]:\n" << *t->stmt() << "\n"; return oss.str(); } } // namespace std diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 5a727cc5a92e..1da0ab8beefb 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -119,7 +119,7 @@ size_t normalizeAndCheckIndex(int64_t idx, int64_t list_size) { } static at::ScalarType tensorType(Tensor* t) { - return static_cast(t->body()->dtype().scalar_type()); + return static_cast(t->buf()->dtype().scalar_type()); } static 
std::vector computeIndicesToBroadcast( @@ -608,7 +608,7 @@ std::vector TensorExprKernel::valueShape( if (it == tensors_.end()) { return {}; } - return ExprVectorToExprHandleVector(it->second->dims()); + return ExprVectorToExprHandleVector(it->second->buf()->dims()); } Tensor* TensorExprKernel::computeOneOperand( @@ -1125,7 +1125,7 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::type_as: { auto const& n = v->node(); Tensor* rhs = tensors_.at(n->inputs()[1]->unique()); - auto dtype = rhs->body()->dtype(); + auto dtype = rhs->buf()->dtype(); return computeOneOperand( "aten_type_as", v, [dtype](const ExprHandle& lhs) { return Cast::make(dtype, lhs); diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 6e63743496cf..7d6e208291ba 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -24,6 +24,11 @@ namespace torch { namespace jit { namespace tensorexpr { +LoopNest::LoopNest(const LoopNest& other) + : root_stmt_(Stmt::clone(other.root_stmt_)), + output_bufs_(other.output_bufs_), + intermediate_bufs_(other.intermediate_bufs_) {} + class FunctionCallUseCount : public IRVisitor { public: std::unordered_map findUses(Stmt* s) { @@ -424,11 +429,7 @@ class DepTracker : public IRVisitor { public: std::vector findUsedTensors(Tensor* tensor) { used_tensors.clear(); - if (tensor->body()) { - tensor->body()->accept(this); - } else { - tensor->ElementStmt()->accept(this); - } + tensor->stmt()->accept(this); return used_tensors; } @@ -508,7 +509,12 @@ LoopNest::LoopNest(const std::vector& output_tensors) { std::vector loops; for (Tensor* t : tensors_to_compute) { - Stmt* loop = t->lowerToStmt(); + Stmt* loop = t->stmt(); + if (loop->get_parent()) { + std::cerr << "Error: creating a loopnest from already used Tensors\n"; + loops = {}; + break; + } // Flatten initializers. if (Block* block = dynamic_cast(loop)) { for (auto* s : block->stmts()) { @@ -544,6 +550,7 @@ class FunctionInliner : public IRMutator { throw std::logic_error("cannot inline Buf with compound indices"); } index_vars_.insert(index_var); + producer_index_vars_.push_back(index_var); } } @@ -563,9 +570,9 @@ class FunctionInliner : public IRMutator { } std::vector index_vars; - TORCH_INTERNAL_ASSERT(buf->ndim() == t->args().size()); + TORCH_INTERNAL_ASSERT(buf->ndim() == producer_index_vars_.size()); for (size_t i = 0; i < buf->ndim(); i++) { - const Var* func_callee_arg = dynamic_cast(t->arg(i)); + const Var* func_callee_arg = producer_index_vars_.at(i); const Expr* func_caller_param = v->param(i); auto iter = inline_mapping_.find(func_callee_arg); if (iter != inline_mapping_.end()) { @@ -686,6 +693,7 @@ class FunctionInliner : public IRMutator { // Index Vars present in the producer. 
std::unordered_set index_vars_; + std::vector producer_index_vars_; std::unordered_map inline_mapping_; diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index 1bff2e96d415..7c27ca6968a5 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -37,6 +37,8 @@ class TORCH_API LoopNest { output_bufs_(output_bufs), intermediate_bufs_(intermediate_bufs) {} + LoopNest(const LoopNest& other); + Stmt* root_stmt() const { return root_stmt_; } diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp index e32f9f293940..3eec21d13f0b 100644 --- a/torch/csrc/jit/tensorexpr/tensor.cpp +++ b/torch/csrc/jit/tensorexpr/tensor.cpp @@ -8,56 +8,60 @@ namespace torch { namespace jit { namespace tensorexpr { -Stmt* Tensor::lowerToStmt() const { - Stmt* s = ElementStmt(); +Stmt* Tensor::constructStmt( + const std::vector& args, + const Expr* body, + const std::vector& reduce_dims, + const std::vector& reduce_args) const { + std::vector indices(args.begin(), args.end()); - // If this Tensor has no functional body, it already has its axes expanded. - if (nullptr == body()) { - return s; - } + const Expr* mask = new IntImm(1); + Stmt* s = new Store(buf_, indices, body, mask); - if (ndim() == 0 && reduce_ndim() == 0) { + size_t ndim = buf()->ndim(); + size_t reduce_ndim = reduce_dims.size(); + + if (ndim == 0 && reduce_ndim == 0) { return s; } const Expr* init_expr = buf()->initializer(); - std::vector indices(args().begin(), args().end()); - - if (reduce_ndim() > 0) { - for (size_t i = 0; i < reduce_ndim(); i++) { + if (reduce_ndim > 0) { + for (size_t i = 0; i < reduce_ndim; i++) { // Going in reverse order: from innermost loop to the outermost - size_t dim_index = reduce_ndim() - i - 1; + size_t dim_index = reduce_ndim - i - 1; s = new For( - reduce_arg(dim_index), new IntImm(0), reduce_dim(dim_index), s); + reduce_args[dim_index], new IntImm(0), reduce_dims[dim_index], s); } if (init_expr) { - Store* init = new Store(buf(), indices, init_expr, new IntImm(1)); - s = new Block({init, s}); + Store* init_stmt = new Store(buf(), indices, init_expr, new IntImm(1)); + s = new Block({init_stmt, s}); } } - for (size_t i = 0; i < ndim(); i++) { + for (size_t i = 0; i < ndim; i++) { // Going in reverse order: from innermost loop to the outermost - size_t dim_index = ndim() - i - 1; - s = new For(arg(dim_index), new IntImm(0), dim(dim_index), s); + size_t dim_index = ndim - i - 1; + s = new For(args[dim_index], new IntImm(0), buf()->dim(dim_index), s); } return s; } Tensor* Compute( - const std::string& func_name, + const std::string& name, const std::vector& dim_args, const std::function&)>& body_func) { std::vector dims; std::vector args; unpack_dim_args(dim_args, &dims, &args); const Expr* body = body_func(VarVectorToVarHandleVector(args)).node(); - return new Tensor(func_name, dims, args, body); + const Buf* buf = new Buf(name, dims, body->dtype()); + return new Tensor(buf, args, body); } Tensor* Compute( - const std::string& func_name, + const std::string& name, const std::vector& dim_args, const std::function& body_func) { if (dim_args.size() != 1) { @@ -68,11 +72,12 @@ Tensor* Compute( std::vector args; unpack_dim_args(dim_args, &dims, &args); const Expr* body = body_func(VarHandle(args[0])).node(); - return new Tensor(func_name, dims, args, body); + const Buf* buf = new Buf(name, dims, body->dtype()); + return new Tensor(buf, args, body); } Tensor* Compute( - const std::string& func_name, + const 
std::string& name, const std::vector& dim_args, const std::function& body_func) { @@ -83,11 +88,12 @@ Tensor* Compute( std::vector args; unpack_dim_args(dim_args, &dims, &args); const Expr* body = body_func(VarHandle(args[0]), VarHandle(args[1])).node(); - return new Tensor(func_name, dims, args, body); + const Buf* buf = new Buf(name, dims, body->dtype()); + return new Tensor(buf, args, body); } Tensor* Compute( - const std::string& func_name, + const std::string& name, const std::vector& dim_args, const std::function< ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&)>& @@ -101,11 +107,12 @@ Tensor* Compute( const Expr* body = body_func(VarHandle(args[0]), VarHandle(args[1]), VarHandle(args[2])) .node(); - return new Tensor(func_name, dims, args, body); + const Buf* buf = new Buf(name, dims, body->dtype()); + return new Tensor(buf, args, body); } Tensor* Compute( - const std::string& func_name, + const std::string& name, const std::vector& dim_args, const std::function dims; - std::vector args_nodes; - unpack_dim_args(dim_args, &dims, &args_nodes); - auto args = VarVectorToVarHandleVector(args_nodes); - const Expr* body = body_func(args[0], args[1], args[2], args[3]).node(); - return new Tensor(func_name, dims, args_nodes, body); -} - -Stmt* Tensor::ElementStmt() const { - std::vector indices; - for (size_t i = 0; i < buf_->ndim(); i++) { - indices.push_back(args_[i]); - } - - const Expr* mask = new IntImm(1); - Stmt* update_stmt = new Store(buf_, indices, body_, mask); - return update_stmt; + std::vector args; + unpack_dim_args(dim_args, &dims, &args); + const Expr* body = body_func( + VarHandle(args[0]), + VarHandle(args[1]), + VarHandle(args[2]), + VarHandle(args[3])) + .node(); + const Buf* buf = new Buf(name, dims, body->dtype()); + return new Tensor(buf, args, body); } Tensor* Reduce( - const std::string& func_name, + const std::string& name, const std::vector& dim_args, const Reducer& reducer, const Placeholder& buffer, const std::vector& reduce_args) { return Reduce( - func_name, + name, dim_args, reducer, [&](ParameterList& p) { return buffer.load(p); }, @@ -149,13 +150,13 @@ Tensor* Reduce( } Tensor* Reduce( - const std::string& func_name, + const std::string& name, const std::vector& dim_args, const Reducer& reducer, Tensor* tensor, const std::vector& reduce_args) { return Reduce( - func_name, + name, dim_args, reducer, [&](ParameterList& p) { return tensor->call(p); }, diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h index 31454f745944..609b5c30a839 100644 --- a/torch/csrc/jit/tensorexpr/tensor.h +++ b/torch/csrc/jit/tensorexpr/tensor.h @@ -14,17 +14,10 @@ namespace tensorexpr { class TORCH_API Tensor : KernelScopedObject { public: - Tensor( - const std::string& name, - const std::vector& dims, - const std::vector& args, - const Expr* body) - // TODO: Function should not create buffers, they should be created - // manually before constructing a function. 
- : buf_(new Buf(name, dims, body->dtype())), args_(args), body_(body) {} - - Tensor(Buf* buf, const std::vector& args, const Expr* body) - : buf_(buf), args_(args), body_(body) {} + Tensor(const Buf* buf, const std::vector& args, const Expr* body) + : buf_(buf) { + stmt_ = constructStmt(args, body, {}, {}); + } Tensor( const Buf* buf, @@ -32,65 +25,19 @@ class TORCH_API Tensor : KernelScopedObject { const std::vector& reduce_dims, const std::vector& reduce_args, const Expr* body) - : buf_(buf), - args_(args), - body_(body), - reduce_dims_(reduce_dims), - reduce_args_(reduce_args) {} + : buf_(buf) { + stmt_ = constructStmt(args, body, reduce_dims, reduce_args); + } - virtual ~Tensor() {} + Tensor(const Buf* buf, Stmt* stmt) : buf_(buf), stmt_(stmt) {} - // Wrappers over accessors to fields of the underlying function - const Expr* body() const { - return body_; - } const Buf* buf() const { return buf_; } - size_t ndim() const { - return buf()->ndim(); - } - const Expr* dim(size_t index) const { - if (index >= ndim()) { - throw out_of_range_index(); - } - return buf()->dim(index); - } - std::vector dims() const { - return buf()->dims(); - } - const Var* arg(size_t index) const { - if (index >= ndim()) { - throw out_of_range_index(); - } - return args_[index]; - } - const std::vector& args() const { - return args_; - } - size_t reduce_ndim() const { - return reduce_dims_.size(); - } - std::vector reduce_dims() const { - return reduce_dims_; - } - std::vector reduce_args() const { - return reduce_args_; - } - const Expr* reduce_dim(size_t index) const { - if (index >= reduce_ndim()) { - throw out_of_range_index(); - } - return reduce_dims_[index]; - } - const Var* reduce_arg(size_t index) const { - if (index >= reduce_ndim()) { - throw out_of_range_index(); - } - return reduce_args_[index]; - } - virtual Stmt* ElementStmt() const; + Stmt* stmt() const { + return stmt_; + } template inline ExprHandle operator()(const Ts&... ts); @@ -99,31 +46,14 @@ class TORCH_API Tensor : KernelScopedObject { template inline ExprHandle call(const Ts&... 
ts); - Stmt* lowerToStmt() const; - private: - const Buf* buf_; - std::vector args_; - const Expr* body_; - std::vector reduce_dims_; - std::vector reduce_args_; -}; - -class TORCH_API CompoundTensor : public Tensor { - public: - CompoundTensor( - const Buf* buf, + Stmt* constructStmt( const std::vector& args, - Stmt* stmt) - : Tensor(buf, args, {}, {}, nullptr), stmt_(stmt) {} - - virtual ~CompoundTensor() {} - - Stmt* ElementStmt() const override { - return stmt_; - } + const Expr* body, + const std::vector& reduce_dims, + const std::vector& reduce_args) const; - private: + const Buf* buf_; Stmt* stmt_; }; From dc2a44c4fc5f5efba16b8567ab970f8bcf1fe007 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Wed, 27 Jan 2021 16:47:01 -0800 Subject: [PATCH 25/41] Back out "Revert D25850783: Add torch::deploy, an embedded torch-python interpreter" (#51124) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51124 Original commit changeset: 1c7133627da2 Test Plan: Test locally with interpreter_test and on CI Reviewed By: suo Differential Revision: D26077905 fbshipit-source-id: fae83bf9822d79e9a9b5641bc5191a7f3fdea78d --- .github/workflows/lint.yml | 6 + .gitignore | 3 + .jenkins/pytorch/build.sh | 11 + .jenkins/pytorch/test.sh | 8 + CMakeLists.txt | 5 + torch/__init__.py | 10 +- torch/_ops.py | 3 +- torch/_utils_internal.py | 16 +- torch/csrc/Module.cpp | 2 + torch/csrc/deploy/.gitignore | 1 + torch/csrc/deploy/CMakeLists.txt | 3 + torch/csrc/deploy/README.md | 10 + torch/csrc/deploy/example/simple.pt | Bin 0 -> 2432 bytes torch/csrc/deploy/example/trace_simple.py | 20 ++ torch/csrc/deploy/interpreter/CMakeLists.txt | 115 +++++++ .../deploy/interpreter/CMakePythonModules.txt | 69 ++++ torch/csrc/deploy/interpreter/freeze.py | 269 +++++++++++++++ .../deploy/interpreter/hide_symbols.script | 5 + torch/csrc/deploy/interpreter/interpreter.cpp | 324 ++++++++++++++++++ torch/csrc/deploy/interpreter/interpreter.h | 67 ++++ .../deploy/interpreter/interpreter_impl.h | 26 ++ torch/csrc/deploy/interpreter/test_main.cpp | 49 +++ .../deploy/interpreter/third_party/README.md | 2 + torch/cuda/__init__.py | 4 + torch/utils/__init__.py | 8 +- 25 files changed, 1024 insertions(+), 12 deletions(-) create mode 100644 torch/csrc/deploy/.gitignore create mode 100644 torch/csrc/deploy/CMakeLists.txt create mode 100644 torch/csrc/deploy/README.md create mode 100644 torch/csrc/deploy/example/simple.pt create mode 100644 torch/csrc/deploy/example/trace_simple.py create mode 100644 torch/csrc/deploy/interpreter/CMakeLists.txt create mode 100644 torch/csrc/deploy/interpreter/CMakePythonModules.txt create mode 100644 torch/csrc/deploy/interpreter/freeze.py create mode 100644 torch/csrc/deploy/interpreter/hide_symbols.script create mode 100644 torch/csrc/deploy/interpreter/interpreter.cpp create mode 100644 torch/csrc/deploy/interpreter/interpreter.h create mode 100644 torch/csrc/deploy/interpreter/interpreter_impl.h create mode 100644 torch/csrc/deploy/interpreter/test_main.cpp create mode 100644 torch/csrc/deploy/interpreter/third_party/README.md diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 54acbe7b1c6a..9c215540108b 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -170,6 +170,8 @@ jobs: # FunctionsManual.cpp is excluded to keep this diff clean. It will be fixed # in a follow up PR. # /torch/csrc/generic/*.cpp is excluded because those files aren't actually built. 
+ # deploy/interpreter files are excluded due to using macros and other techniquies + # that are not easily converted to accepted c++ python tools/clang_tidy.py \ --verbose \ --paths torch/csrc/ \ @@ -186,6 +188,10 @@ jobs: -g"-torch/csrc/autograd/FunctionsManual.cpp" \ -g"-torch/csrc/generic/*.cpp" \ -g"-torch/csrc/jit/codegen/cuda/runtime/*" \ + -g"-torch/csrc/deploy/interpreter/interpreter.cpp" \ + -g"-torch/csrc/deploy/interpreter/interpreter.h" \ + -g"-torch/csrc/deploy/interpreter/interpreter_impl.h" \ + -g"-torch/csrc/deploy/interpreter/test_main.cpp" \ "$@" > ${GITHUB_WORKSPACE}/clang-tidy-output.txt cat ${GITHUB_WORKSPACE}/clang-tidy-output.txt diff --git a/.gitignore b/.gitignore index e1fe94cb9bf9..a3a832ce7555 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,9 @@ torch/csrc/autograd/generated/* torch/testing/_internal/generated/annotated_fn_args.py torch/testing/_internal/data/*.pt torch/csrc/cudnn/cuDNN.cpp +torch/csrc/deploy/interpreter/cpython +torch/csrc/deploy/interpreter/frozen +torch/csrc/deploy/interpreter/third_party/typing_extensions.py torch/csrc/generated torch/csrc/generic/TensorMethods.cpp torch/csrc/jit/generated/* diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index fad9c8e49e64..dfd359c1ddf4 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -23,6 +23,17 @@ if [[ "$BUILD_ENVIRONMENT" == *-mobile-code-analysis* ]]; then exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile-code-analysis.sh" "$@" fi +if [[ "$BUILD_ENVIRONMENT" == *linux-xenial-cuda10.2-cudnn7-py3-gcc7* ]]; then + # Enabling DEPLOY build (embedded torch python interpreter, experimental) + # only on one config for now, can expand later + export USE_DEPLOY=ON + + # Deploy feature builds cpython. It requires these packages. + # TODO move this to dockerfile? + sudo apt-get -qq update + sudo apt-get -qq install libffi-dev libbz2-dev libreadline-dev libncurses5-dev libncursesw5-dev libgdbm-dev libsqlite3-dev uuid-dev tk-dev +fi + echo "Python version:" python --version diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 73563f145eb8..d70a377ec086 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -354,6 +354,11 @@ test_vec256() { fi } +test_torch_deploy() { + SIMPLE_MODEL_PATH=torch/csrc/deploy/example/simple.pt LIBINTERPRETER_PATH=build/lib/libinterpreter.so build/bin/interpreter_test + assert_git_not_dirty +} + if ! 
[[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") @@ -371,6 +376,9 @@ elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 ]]; then + if [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-test1 ]]; then + test_torch_deploy + fi install_torchvision test_python_shard1 elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 ]]; then diff --git a/CMakeLists.txt b/CMakeLists.txt index a23208752afb..c138f261c27b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -919,3 +919,8 @@ endif() include(cmake/Summary.cmake) caffe2_print_configuration_summary() + +# ---[ Torch Deploy +if(USE_DEPLOY) + add_subdirectory(torch/csrc/deploy) +endif() diff --git a/torch/__init__.py b/torch/__init__.py index 3f9df8bc009a..f27af91eb493 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -22,7 +22,11 @@ from ._utils import _import_dotted_name from ._utils_internal import get_file_path, prepare_multiprocessing_environment, \ USE_RTLD_GLOBAL_WITH_LIBTORCH, USE_GLOBAL_DEPS -from .version import __version__ +# TODO(torch_deploy) figure out how to freeze version.py in fbcode build +if sys.executable == 'torch_deploy': + __version__ = "torch-deploy-1.8" +else: + from .version import __version__ from ._six import string_classes as _string_classes from typing import Set, Type, TYPE_CHECKING @@ -134,7 +138,7 @@ # See Note [Global dependencies] def _load_global_deps(): - if platform.system() == 'Windows': + if platform.system() == 'Windows' or sys.executable == 'torch_deploy': return lib_name = 'libtorch_global_deps' + ('.dylib' if platform.system() == 'Darwin' else '.so') @@ -516,7 +520,7 @@ class QUInt4x2Storage(_C.QUInt4x2StorageBase, _StorageBase): ################################################################################ def manager_path(): - if platform.system() == 'Windows': + if platform.system() == 'Windows' or sys.executable == 'torch_deploy': return b"" path = get_file_path('torch', 'bin', 'torch_shm_manager') prepare_multiprocessing_environment(get_file_path('torch')) diff --git a/torch/_ops.py b/torch/_ops.py index dd0c8cd19fde..96c8baac7838 100644 --- a/torch/_ops.py +++ b/torch/_ops.py @@ -2,7 +2,6 @@ import contextlib import ctypes -import os import sys import types @@ -67,7 +66,7 @@ def __getattr__(self, op_name): return op class _Ops(types.ModuleType): - __file__ = os.path.join(os.path.dirname(__file__), '_ops.py') + __file__ = '_ops.py' def __init__(self): super(_Ops, self).__init__('torch.ops') diff --git a/torch/_utils_internal.py b/torch/_utils_internal.py index be7d8fcaa685..c77e960ae659 100644 --- a/torch/_utils_internal.py +++ b/torch/_utils_internal.py @@ -1,6 +1,7 @@ import os import inspect +import sys import tempfile # this arbitrary-looking assortment of functionality is provided here @@ -8,11 +9,16 @@ # use is the FB build environment, where this source file is replaced # by an equivalent. -if os.path.basename(os.path.dirname(__file__)) == 'shared': - torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) +if sys.executable == 'torch_deploy': + # __file__ is meaningless in the context of frozen torch used in torch deploy. 
+ # setting empty torch_parent should allow below functions to operate without crashing, + # but it's unclear if there is a valid use case for them in the context of deploy. + torch_parent = "" else: - torch_parent = os.path.dirname(os.path.dirname(__file__)) - + if os.path.basename(os.path.dirname(__file__)) == 'shared': + torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + else: + torch_parent = os.path.dirname(os.path.dirname(__file__)) def get_file_path(*path_components): return os.path.join(torch_parent, *path_components) @@ -60,7 +66,7 @@ def get_source_lines_and_file(obj, error_msg=None): TEST_MASTER_ADDR = '127.0.0.1' TEST_MASTER_PORT = 29500 -# USE_GLOBAL_DEPS controls whether __init__.py tries to load +# USE_GLOBAL_DEPS controls whether __init__.py tries to load # libtorch_global_deps, see Note [Global dependencies] USE_GLOBAL_DEPS = True # USE_RTLD_GLOBAL_WITH_LIBTORCH controls whether __init__.py tries to load diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index eb80fff32b81..bbd3ccef505f 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -692,6 +692,8 @@ extern "C" #ifdef _WIN32 __declspec(dllexport) #endif +TORCH_API PyObject* initModule(); +// separate decl and defn for msvc error C2491 PyObject* initModule() { HANDLE_TH_ERRORS at::internal::lazy_init_num_threads(); diff --git a/torch/csrc/deploy/.gitignore b/torch/csrc/deploy/.gitignore new file mode 100644 index 000000000000..aa484a97a20f --- /dev/null +++ b/torch/csrc/deploy/.gitignore @@ -0,0 +1 @@ +example/generated/* diff --git a/torch/csrc/deploy/CMakeLists.txt b/torch/csrc/deploy/CMakeLists.txt new file mode 100644 index 000000000000..9da314905860 --- /dev/null +++ b/torch/csrc/deploy/CMakeLists.txt @@ -0,0 +1,3 @@ +set(DEPLOY_DIR "${CMAKE_CURRENT_SOURCE_DIR}") + +add_subdirectory(interpreter) diff --git a/torch/csrc/deploy/README.md b/torch/csrc/deploy/README.md new file mode 100644 index 000000000000..4fab5aa4ef56 --- /dev/null +++ b/torch/csrc/deploy/README.md @@ -0,0 +1,10 @@ +# Torch Deploy +This is an experimental feature to embed multiple python interpreters inside the torch library, +providing a solution to the 'GIL problem' for multithreading with the convenience of python +and eager or torchscripted pytorch programs. + +# libinterpreter +This is an internal library used behind the scenes to enable multiple python interpreters in +a single deploy runtime. libinterpreter.so is DLOPENed multiple times by the deploy library. +Each copy of libinterpreter exposes a simple interpreter interface but hides its python and other +internal symbols, preventing the different python instances from seeing each other. 
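
For illustration only (not part of this patch): the isolation mechanism the README above describes — loading a private copy of a shared library for each interpreter so every copy keeps its own hidden globals — can be sketched from Python with `ctypes`. The library name `libcounter.so` and its `bump` function below are hypothetical stand-ins; only the copy-then-`dlopen`-with-`RTLD_LOCAL` pattern reflects what libinterpreter actually relies on.

```
# Minimal sketch, assuming a hypothetical libcounter.so exposing int bump(void).
import ctypes
import shutil
import tempfile

def load_private_copy(lib_path):
    # Copy the .so to a distinct temp path so the dynamic loader treats it as a
    # separate library image, then open it with RTLD_LOCAL so its symbols stay
    # hidden from other copies.
    tmp = tempfile.NamedTemporaryFile(suffix=".so", delete=False)
    shutil.copyfile(lib_path, tmp.name)
    return ctypes.CDLL(tmp.name, mode=ctypes.RTLD_LOCAL)

lib_a = load_private_copy("libcounter.so")
lib_b = load_private_copy("libcounter.so")
# Each copy has independent global state, so their counters do not interfere.
print(lib_a.bump(), lib_a.bump())  # 1 2
print(lib_b.bump())                # 1
```
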
diff --git a/torch/csrc/deploy/example/simple.pt b/torch/csrc/deploy/example/simple.pt new file mode 100644 index 0000000000000000000000000000000000000000..50f9a087aa822821647a8acabe28bb84207475b2 GIT binary patch literal 2432 zcmah~4Ny~87JmE%jN!K_C{c`n7DSRLExN!vhoWYYRE&s2)-FvT36Ky7`GEqhO3~?p zL|_?ZsctLdMyZHOE5mjH5kaV>s1(uBHlpj!S}e8F-KAEovhVR{g6sCo+_~?)d%knu z`R@776^k4k3BtvNc+OmjmkEO^D@UW`D^iRpe1R~=lh``jhIZRyK=*I)!cZZq8){(n z@pmC?B^ynOG?YYeu>YNUa`b@-j{Wg<^dcXFL{|=d_n*LpxJB4{DiZ_0cLHYUD8#+K z1H*4J@SRPF(w;cn*y4?rHxI+^Z-0Txury4*pNcYmEbge-j72SNa8-N+niN~G_qF}N z<@_7^%N(&n#=@$r$*7cjV)N&FVM)6^KJ=2I;OA!2;OT%JqWUNghic$ze6Z|;`Ixk4 zK3-5SLHR&3X6&{=+nIhCDBc5>@l4=GrIPiSf`i2ftn~r#S$8M-R@H4N@;w^W9&!18t;$kz&B73@dhIeC+LVs` zlLq9;xM(`F6k0QwC>~BDzdtq({_3B=an%TTuO5a^Bl;k4EEHMiO5sOy27bIifQgN2 zB-}WWA20bC8nwOTsGTFcx{C#kmHi}KbDNxWcOnTn)#y)>Gs-O0PW1JTh zx_jV>{nvnD3B?xEILwnAALMW+lz(s#$;PQ1cyw7?Ix5xsJ`Ii&$ zMoAm|aArNmfEhSP6EJ$c5-*z~F>S0H@&#F7Svm^Qi5Wn6X(N+=iojq=D4xeZ!0GNS z_)UBu4zBgWo|~V*DY*)xx09rY-*HlLX#)}-b5IW_z$M%b<64iwf$(M6o6`(+8@-=5os3(8l*-#T1TEOK_~{^#dUxef$D5Ah52=1v4y>y4L_ zsS(C3dR}q13M{ghOQlAwUY;S9GB*@#&?<5@N~Vqoes5clgh7e=ZKWzb!>F^9PdM?U zxki=7z?ABhskthRLTXfI8?<_9-b$vgghr##}J)2Zqhl6h#(PD#5M+r*djY)iCnJLXq0lJN}Fw9ZqzH3dZl8m zN^WH8T!Isf3A#BFy5|`6DXMH$cDjyPlAEfV%cJS)jB2R3M6J<-ri)Frr|0|AJR)S8 zF1c2ryq!>>$|8?x(K!lPgZp|mzPV4UDpD?SS2)b9a*EAyjQ>UMh zppHCUI#E_GGxU(y`Uv~?R zH22tDTQeU2<}#k8qp-i&eDsqR-y4#3*9XQpA0E7W)HCUK1BpX}Y@ZMJ|J~a?yR+=^ z3hfWqUuo3J@RDhN-3~f$-}Idi{!HaOFqQZJzN^@T|FgWa4aSsgqhV4&elwX)pSbN@ zWFI_HbXoj4Nz_G1(Bs7#bt9su_sLW04ODFlW1@a#2iyH-zlAaBEWi3FeTzM{Gd;v2 zJ9?VAldZdlE4^T2HaGNasI`*ZZD2W+c&(r`SCimpVzaKec{bpK6iHLy=cG3~iuL*V z+eB${dU2FzwK_W`>vLk;#A~3s-OOKE)xhis)?B;UL|{|hZz{sn>54@R#_}m2KZnp6 pZ@RJ4Ha(l@o=900-9IU#a1&2{b0**EWwFR/configure --prefix ${PYTHON_INSTALL_DIR} + BUILD_COMMAND CFLAGS=-fPIC CPPFLAGS=-fPIC make -j8 + INSTALL_COMMAND make install + BYPRODUCTS ${PYTHON_MODULES} ${PYTHON_LIB} ${PYTHON_BIN} + LOG_OUTPUT_ON_FAILURE True +) + +# We find the built python modules, this is confusing because python build already outputs +# the modules in a strange nested path, and then that path is relative to the +# Cmake ExternalProject root in the cmake build dir. 
+ExternalProject_Get_property(cpython SOURCE_DIR) +SET(PYTHON_MODULE_DIR "${SOURCE_DIR}/build/temp.linux-x86_64-3.8/${SOURCE_DIR}/Modules") +SET(PYTHON_STDLIB_DIR "${SOURCE_DIR}/Lib") +SET(PYTHON_STDLIB "${PYTHON_INSTALL_DIR}/lib/libpython_stdlib3.8.a") +# Then we use a hardcoded list of expected module names and include them in our lib +include("CMakePythonModules.txt") +ExternalProject_Add_Step( + cpython + archive_stdlib + DEPENDEES install + BYPRODUCTS ${PYTHON_STDLIB} + COMMAND ar -rc ${PYTHON_STDLIB} ${PYTHON_MODULES} + VERBATIM +) +# Get python typing extension, needed by torch +SET(TYPING_PKG "${INTERPRETER_DIR}/third_party/typing_extensions.py") +ExternalProject_Add( + typing + PREFIX typing + GIT_REPOSITORY https://github.com/python/typing.git + GIT_TAG 3.7.4.3 + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND cp ../typing/typing_extensions/src_py3/typing_extensions.py ${TYPING_PKG} + BYPRODUCTS ${TYPING_PKG} + LOG_OUTPUT_ON_FAILURE True +) + +# Output files generated by freeze script, containing frozen bytecode +SET(FROZEN_DIR "${INTERPRETER_DIR}/frozen") +set(FROZEN_FILES + ${FROZEN_DIR}/main.c + ${FROZEN_DIR}/bytecode_0.c + ${FROZEN_DIR}/bytecode_1.c + ${FROZEN_DIR}/bytecode_2.c + ${FROZEN_DIR}/bytecode_3.c + ${FROZEN_DIR}/bytecode_4.c +) +# Packages to freeze: python stdlib, typing extension, and torch +add_custom_command( + OUTPUT ${FROZEN_FILES} + WORKING_DIRECTORY ${INTERPRETER_DIR} + COMMAND mkdir -p ${FROZEN_DIR} + COMMAND ${PYTHON_BIN} freeze.py ${PYTHON_STDLIB_DIR} ${TYPING_PKG} ${PYTORCH_ROOT}/torch --oss --install_dir ${FROZEN_DIR} --verbose + DEPENDS cpython typing + VERBATIM +) + +# instantiate a library based on the objects that make up torch_python +# make sure system python isn't used here +target_include_directories(torch_python_obj BEFORE PRIVATE ${PYTHON_INC_DIR}) +add_library(torch_python_static STATIC $) +# Build the interpreter lib, designed to be standalone and dlopened +# We bake the python and torch_python binding objs into libinterpreter +set(LINKER_SCRIPT "${INTERPRETER_DIR}/hide_symbols.script") +set(INTERPRETER_LIB_SOURCES + ${INTERPRETER_DIR}/interpreter.cpp + ${FROZEN_FILES} + ${LINKER_SCRIPT} +) +add_library(interpreter ${INTERPRETER_LIB_SOURCES} ${LINKER_SCRIPT}) +set_property(TARGET interpreter APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,--version-script=${LINKER_SCRIPT}") +# need to ensure headers are present before any .cpp in interpreter are compiled, +# but cpp themselves don't clearly depend on cpython so there is a race otherwise +add_dependencies(interpreter cpython) +target_compile_options( + interpreter PRIVATE + -fvisibility=hidden +) +target_include_directories(interpreter PRIVATE ${INTERPRETER_DIR}) +target_include_directories(interpreter PUBLIC ${PYTHON_INC_DIR}) +target_link_libraries(interpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static) +target_link_libraries(interpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins +target_link_libraries(interpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite) + +# handy to have a standalone app to verify linkage and usage of interpreter before embedding it in another lib +set(INTERPRETER_TEST_SOURCES + ${INTERPRETER_DIR}/test_main.cpp +) +add_executable(interpreter_test ${INTERPRETER_TEST_SOURCES}) +target_include_directories(interpreter_test PRIVATE ${PYTORCH_ROOT}/torch) +target_include_directories(interpreter_test PRIVATE ${PYTHON_INC_DIR}) 
+target_link_libraries(interpreter_test PUBLIC gtest dl) +# no-as-needed to ensure shm and torch are included to satisfy runtime dlopen +# dependencies for libinterpreter, regardless of whether they are used in interpreter_test +target_link_libraries(interpreter_test PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite) diff --git a/torch/csrc/deploy/interpreter/CMakePythonModules.txt b/torch/csrc/deploy/interpreter/CMakePythonModules.txt new file mode 100644 index 000000000000..c6bc9cab76ff --- /dev/null +++ b/torch/csrc/deploy/interpreter/CMakePythonModules.txt @@ -0,0 +1,69 @@ +SET(PYTHON_MODULES + ${PYTHON_MODULE_DIR}/arraymodule.o + ${PYTHON_MODULE_DIR}/_asynciomodule.o + ${PYTHON_MODULE_DIR}/audioop.o + ${PYTHON_MODULE_DIR}/binascii.o + ${PYTHON_MODULE_DIR}/_bisectmodule.o + ${PYTHON_MODULE_DIR}/_blake2/blake2module.o ${PYTHON_MODULE_DIR}/_blake2/blake2b_impl.o ${PYTHON_MODULE_DIR}/_blake2/blake2s_impl.o + ${PYTHON_MODULE_DIR}/_bz2module.o + ${PYTHON_MODULE_DIR}/cmathmodule.o + # ${PYTHON_MODULE_DIR}/_math.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_cn.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_hk.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_iso2022.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_jp.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_kr.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_tw.o + ${PYTHON_MODULE_DIR}/_contextvarsmodule.o + ${PYTHON_MODULE_DIR}/_cryptmodule.o + ${PYTHON_MODULE_DIR}/_csv.o + ${PYTHON_MODULE_DIR}/_ctypes/_ctypes.o ${PYTHON_MODULE_DIR}/_ctypes/callbacks.o ${PYTHON_MODULE_DIR}/_ctypes/callproc.o ${PYTHON_MODULE_DIR}/_ctypes/stgdict.o ${PYTHON_MODULE_DIR}/_ctypes/cfield.o + ${PYTHON_MODULE_DIR}/_ctypes/_ctypes_test.o + ${PYTHON_MODULE_DIR}/_cursesmodule.o + ${PYTHON_MODULE_DIR}/_curses_panel.o + ${PYTHON_MODULE_DIR}/_datetimemodule.o + ${PYTHON_MODULE_DIR}/_decimal/_decimal.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/basearith.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/constants.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/context.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/convolute.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/crt.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/difradix2.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/fnt.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/fourstep.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/io.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/memory.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/mpdecimal.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/numbertheory.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/sixstep.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/transpose.o + ${PYTHON_MODULE_DIR}/_elementtree.o + ${PYTHON_MODULE_DIR}/fcntlmodule.o + ${PYTHON_MODULE_DIR}/grpmodule.o + ${PYTHON_MODULE_DIR}/_hashopenssl.o + ${PYTHON_MODULE_DIR}/_heapqmodule.o + ${PYTHON_MODULE_DIR}/_json.o + ${PYTHON_MODULE_DIR}/_lsprof.o + ${PYTHON_MODULE_DIR}/_lzmamodule.o + ${PYTHON_MODULE_DIR}/mathmodule.o + ${PYTHON_MODULE_DIR}/md5module.o + ${PYTHON_MODULE_DIR}/mmapmodule.o + ${PYTHON_MODULE_DIR}/cjkcodecs/multibytecodec.o + ${PYTHON_MODULE_DIR}/_multiprocessing/multiprocessing.o ${PYTHON_MODULE_DIR}/_multiprocessing/semaphore.o + ${PYTHON_MODULE_DIR}/nismodule.o + ${PYTHON_MODULE_DIR}/_opcode.o + ${PYTHON_MODULE_DIR}/ossaudiodev.o + ${PYTHON_MODULE_DIR}/parsermodule.o + ${PYTHON_MODULE_DIR}/_pickle.o + ${PYTHON_MODULE_DIR}/_posixsubprocess.o + ${PYTHON_MODULE_DIR}/pyexpat.o ${PYTHON_MODULE_DIR}/expat/xmlparse.o ${PYTHON_MODULE_DIR}/expat/xmlrole.o ${PYTHON_MODULE_DIR}/expat/xmltok.o + ${PYTHON_MODULE_DIR}/_queuemodule.o + ${PYTHON_MODULE_DIR}/_randommodule.o + 
${PYTHON_MODULE_DIR}/readline.o + ${PYTHON_MODULE_DIR}/resource.o + ${PYTHON_MODULE_DIR}/selectmodule.o + ${PYTHON_MODULE_DIR}/sha1module.o + ${PYTHON_MODULE_DIR}/sha256module.o + ${PYTHON_MODULE_DIR}/_sha3/sha3module.o + ${PYTHON_MODULE_DIR}/sha512module.o + ${PYTHON_MODULE_DIR}/socketmodule.o + ${PYTHON_MODULE_DIR}/spwdmodule.o + ${PYTHON_MODULE_DIR}/_ssl.o + ${PYTHON_MODULE_DIR}/_struct.o + ${PYTHON_MODULE_DIR}/syslogmodule.o + ${PYTHON_MODULE_DIR}/termios.o + ${PYTHON_MODULE_DIR}/_testbuffer.o + ${PYTHON_MODULE_DIR}/_testcapimodule.o + ${PYTHON_MODULE_DIR}/_testimportmultiple.o + ${PYTHON_MODULE_DIR}/_testmultiphase.o + ${PYTHON_MODULE_DIR}/unicodedata.o + ${PYTHON_MODULE_DIR}/xxlimited.o + ${PYTHON_MODULE_DIR}/_xxtestfuzz/_xxtestfuzz.o ${PYTHON_MODULE_DIR}/_xxtestfuzz/fuzzer.o + ${PYTHON_MODULE_DIR}/zlibmodule.o +) diff --git a/torch/csrc/deploy/interpreter/freeze.py b/torch/csrc/deploy/interpreter/freeze.py new file mode 100644 index 000000000000..459b7be9381c --- /dev/null +++ b/torch/csrc/deploy/interpreter/freeze.py @@ -0,0 +1,269 @@ +""" +Freeze Python packages. + +Freezing makes it possible to ship arbitrary Python modules as part of a C++ +library. The Python source of the module is compiled to bytecode and written +to `.c` files, to be imported by Python's built-in FrozenImporter. + +In a normal Python installation, FrozenImporter is only used to bootstrap the +initialization of the import machinery. Python's importers are defined in +Python (see `_bootstrap.py` and `_bootstrap_external.py`) but need to be +retrieved before any importers are available. Freezing the module bytecode +resolves this circular dependency. + +This script will freeze the Python standard library. It produces two things: +- Bytecode files: A set of `.c` that define C variables containing Python bytecode. +- Main file: A `main.c` file listing all of these modules in the right form to be + consumed by FrozenImporter. + +The library that wishes to these modules make them available to the local +Python instance by extending `PyImport_FrozenModules` appropriately (see +https://docs.python.org/3/c-api/import.html#c.PyImport_FrozenModules). +""" + +import argparse +import functools +import itertools +import marshal +import os +from dataclasses import dataclass +from pathlib import Path +from typing import List + + +MAIN_INCLUDES = """#include + +""" + +MAIN_PREFIX = """ +// Compiled standard library modules. These should be appended to the existing +// `PyImport_FrozenModules` that ships with CPython. +struct _frozen _PyImport_FrozenModules_torch[] = { +""" + +FAKE_PREFIX = """ +// Compiled standard library modules. These should be appended to the existing +// `PyImport_FrozenModules` that ships with CPython. +struct _frozen _PyImport_FrozenModules[] = { +""" + +MAIN_SUFFIX = """\ + {0, 0, 0} /* sentinel */ +}; +""" + +# Exclude some standard library modules to: +# 1. Slim down the final frozen lib. +# 2. Remove functionality we don't want to support. +DENY_LIST = [ + # Interface to unix databases + "dbm", + # ncurses bindings (terminal interfaces) + "curses", + # Tcl/Tk GUI + "tkinter", + "tkinter", + # Tests for the standard library + "test", + "tests", + "idle_test", + "__phello__.foo.py", + # importlib frozen modules. These are already baked into CPython. 
+ "_bootstrap.py", + "_bootstrap_external.py", +] + +NUM_BYTECODE_FILES = 5 + + +def indent_msg(fn): + @functools.wraps(fn) + def wrapper(*args, **kwargs): + args[0].indent += 1 + ret = fn(*args, **kwargs) + args[0].indent -= 1 + return ret + + return wrapper + + +@dataclass +class FrozenModule: + # The fully qualified module name, e.g. 'foo.bar.baz' + module_name: str + # The name of the C variable that holds the bytecode, e.g. 'M_foo__bar__baz' + c_name: str + # The size of the C variable. Negative if this module is a package. + size: int + # The frozen bytecode + bytecode: bytes + + +class Freezer: + def __init__(self, verbose: bool): + self.frozen_modules: List[FrozenModule] = [] + self.indent: int = 0 + self.verbose: bool = verbose + + def msg(self, path: Path, code: str): + if not self.verbose: + return + # P: package dir + # F: python file + # S: skipped (not a package dir) + # X: skipped (deny-listed) + # N: skipped (not a python file) + for i in range(self.indent): + print(" ", end="") + print(f"{code} {path}") + + def write_bytecode(self, install_root): + """ + Write the `.c` files containing the frozen bytecode. Shard frozen + modules evenly across the files. + """ + bytecode_file_names = [ + f"bytecode_{i}.c" for i in range(NUM_BYTECODE_FILES) + ] + bytecode_files = [open(os.path.join(install_root, name), "w") for name in bytecode_file_names] + it = itertools.cycle(bytecode_files) + for m in self.frozen_modules: + self.write_frozen(m, next(it)) + + for f in bytecode_files: + f.close() + + def write_main(self, install_root, oss): + """ + Write the `main.c` file containing a table enumerating all the + frozen modules. + """ + with open(os.path.join(install_root, "main.c"), "w") as outfp: + outfp.write(MAIN_INCLUDES) + for m in self.frozen_modules: + outfp.write(f"extern unsigned char {m.c_name}[];\n") + + outfp.write(MAIN_PREFIX) + for m in self.frozen_modules: + outfp.write(f'\t{{"{m.module_name}", {m.c_name}, {m.size}}},\n') + outfp.write(MAIN_SUFFIX) + if oss: + outfp.write(FAKE_PREFIX) + outfp.write(MAIN_SUFFIX) + + def write_frozen(self, m: FrozenModule, outfp): + """ + Write a single frozen module's bytecode out to a C variable. + """ + outfp.write(f"unsigned char {m.c_name}[] = {{") + for i in range(0, len(m.bytecode), 16): + outfp.write("\n\t") + for c in bytes(m.bytecode[i : i + 16]): + outfp.write("%d," % c) + outfp.write("\n};\n") + + def compile_path(self, path: Path, top_package_path: Path): + """Generic entry point for compiling a Path object.""" + if path.is_dir(): + self.compile_package(path, top_package_path) + else: + self.compile_file(path, top_package_path) + + @indent_msg + def compile_package(self, path: Path, top_package_path: Path): + """Compile all the files within a Python package dir.""" + assert path.is_dir() + if path.name in DENY_LIST: + self.msg(path, "X") + return + + # Python packages are directories that have __init__.py in them. + is_package_dir = any([child.name == "__init__.py" for child in path.iterdir()]) + if not is_package_dir: + self.msg(path, "S") + return + + self.msg(path, "P") + # Recursively compile all children in this dir + for child in path.iterdir(): + self.compile_path(child, top_package_path) + + def get_module_qualname(self, file_path: Path, top_package_path: Path) -> List[str]: + # `path` looks like 'Lib/foo/bar/baz.py' + + # chop off 'Lib/' to get something that represents a Python module hierarchy. + # e.g. 
'foo/bar/baz.py', which maps to 'foo.bar.baz' + normalized_path = file_path.relative_to(top_package_path.parent) + + if normalized_path.name == "__init__.py": + # Special handling for `__init__.py`. In this case, this file + # specifies that the containing directory should be treated as a package. + # For 'foo/bar/baz/__init__.py': + # - The module name is 'baz' + module_basename = normalized_path.parent.name + # - The parent is foo.bar (need to shave off the 'baz') + module_parent = normalized_path.parent.parent.parts + else: + module_basename = normalized_path.stem + module_parent = normalized_path.parent.parts + return list(module_parent) + [module_basename] + + @indent_msg + def compile_file(self, path: Path, top_package_path: Path): + """ + Compile a Python source file to frozen bytecode. Append the result to + `self.frozen_modules`. + """ + assert path.is_file() + if path.suffix != ".py": + self.msg(path, "N") + return + + if path.name in DENY_LIST: + self.msg(path, "X") + return + + self.msg(path, "F") + module_qualname = self.get_module_qualname(path, top_package_path) + module_mangled_name = "__".join(module_qualname) + c_name = "M_" + module_mangled_name + + with open(path, "r") as src_file: + co = compile(src_file.read(), path, "exec") + + bytecode = marshal.dumps(co) + size = len(bytecode) + if path.name == '__init__.py': + # Python packages are signified by negative size. + size = -size + self.frozen_modules.append( + FrozenModule(".".join(module_qualname), c_name, size, bytecode) + ) + + +parser = argparse.ArgumentParser(description="Compile py source") +parser.add_argument("paths", nargs="*", help="Paths to freeze.") +parser.add_argument("--verbose", action="store_true", help="Print debug logs") +parser.add_argument("--install_dir", help="Root directory for all output files") +parser.add_argument("--fbcode_dir", help="Root directory for all output files") +parser.add_argument("--oss", action="store_true", help="If it's OSS build, add a fake _PyImport_FrozenModules") + +args = parser.parse_args() + +f = Freezer(args.verbose) + +for p in args.paths: + if args.fbcode_dir: + p = os.path.join(args.fbcode_dir, p) + path = Path(p) + if path.is_dir() and not Path.exists(path / '__init__.py'): + # this 'top level path p' is a standard directory containing modules, + # not a module itself + # each 'mod' could be a dir containing __init__.py or .py file + for mod in path.glob("*"): + f.compile_path(mod, mod) + else: + f.compile_path(path, path) + +f.write_bytecode(args.install_dir) +f.write_main(args.install_dir, args.oss) diff --git a/torch/csrc/deploy/interpreter/hide_symbols.script b/torch/csrc/deploy/interpreter/hide_symbols.script new file mode 100644 index 000000000000..c748c8bfec95 --- /dev/null +++ b/torch/csrc/deploy/interpreter/hide_symbols.script @@ -0,0 +1,5 @@ +INTERPRETER_0.1 { + global: + initialize_interface; + local: *; # hide everything else +}; diff --git a/torch/csrc/deploy/interpreter/interpreter.cpp b/torch/csrc/deploy/interpreter/interpreter.cpp new file mode 100644 index 000000000000..7d685d33435c --- /dev/null +++ b/torch/csrc/deploy/interpreter/interpreter.cpp @@ -0,0 +1,324 @@ +#include + +#define PY_SSIZE_T_CLEAN +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +using namespace py::literals; + +// TODO this should come from cmake +#define DEBUG 0 +template +const auto PYOBJ_ASSERT(T obj) { +#if (DEBUG == 1) + if (NULL == obj) { + PyErr_Print(); + } +#endif + 
TORCH_INTERNAL_ASSERT(NULL != obj); +} + +static wchar_t* program; + +#define FOREACH_LIBRARY(_) \ + _(array) \ + _(_asyncio) \ + _(audioop) \ + _(binascii) \ + _(_bisect) \ + _(_blake2) \ + _(_bz2) \ + _(cmath) \ + _(_codecs_cn) \ + _(_codecs_hk) \ + _(_codecs_iso2022) \ + _(_codecs_jp) \ + _(_codecs_kr) \ + _(_codecs_tw) \ + _(_contextvars) \ + _(_crypt) \ + _(_csv) \ + _(_ctypes) \ + _(_ctypes_test) \ + _(_curses) \ + _(_curses_panel) \ + _(_datetime) \ + _(_decimal) \ + _(_elementtree) \ + _(fcntl) \ + _(grp) \ + _(_hashlib) \ + _(_heapq) \ + _(_json) \ + _(_lsprof) \ + _(_lzma) \ + _(math) \ + _(_md5) \ + _(mmap) \ + _(_multibytecodec) \ + _(_multiprocessing) \ + _(nis) \ + _(_opcode) \ + _(ossaudiodev) \ + _(parser) \ + _(_pickle) \ + _(_posixsubprocess) \ + _(pyexpat) \ + _(_queue) \ + _(_random) \ + _(readline) \ + _(resource) \ + _(select) \ + _(_sha1) \ + _(_sha256) \ + _(_sha3) \ + _(_sha512) \ + _(_socket) \ + _(spwd) \ + _(_ssl) \ + _(_struct) \ + _(syslog) \ + _(termios) \ + _(_testbuffer) \ + _(_testcapi) \ + _(_testimportmultiple) \ + _(_testmultiphase) \ + _(unicodedata) \ + _(xxlimited) \ + _(_xxtestfuzz) \ + _(zlib) + +#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void); +FOREACH_LIBRARY(DECLARE_LIBRARY_INIT) +#undef DECLARE_LIBRARY_INIT + +extern "C" __attribute__((visibility("default"))) void initialize_interface( + InterpreterImpl* s) { +#define INITIALIZE_MEMBER(func) s->func = func; + FOREACH_INTERFACE_FUNCTION(INITIALIZE_MEMBER) +#undef INITIALIZE_MEMBER +} + +// These numbers of modules should not change as long as the cpython version +// embedded in the build remains fixed +static const size_t NUM_FROZEN_PY_BUILTIN_MODULES = 6; +static const size_t NUM_FROZEN_PY_STDLIB_MODULES = 680; + +// We need to preserve the existing FrozenModules list, since it includes +// important importlib machinery. This code is adapted from the similar +// `PyImport_ExtendInittab`. +int extendFrozenModules(struct _frozen *frozenpython, struct _frozen *frozentorch) { + struct _frozen *p = nullptr; + size_t a = 0, b = 0, c = 0; + int res = 0; + + /* Count the number of entries in both tables */ + for (a = 0; frozenpython[a].name != nullptr; a++) { + // std::cout << "frozenpython[" << a << "]: " << frozenpython[a].name << std::endl; + } + for (b = 0; frozentorch[b].name != nullptr; b++) { + // std::cout << "frozentorch[" << b << "]: " << frozentorch[b].name << std::endl; + } + for (c = 0; PyImport_FrozenModules[c].name != nullptr; c++) { + // std::cout << "oldfrozen[" << c << "]: " << PyImport_FrozenModules[c].name << std::endl; + } + + // Num frozen builtins shouldn't change (unless modifying the underlying cpython version) + TORCH_INTERNAL_ASSERT(c == NUM_FROZEN_PY_BUILTIN_MODULES, "Missing python builtin frozen modules"); + // Check a+b together since in OSS a is empty and b contains stdlib+torch, while + // in fbcode they are separated due to thirdparty2 frozenpython. + // No fixed number of torch modules to check for, but there should be at least one. 
+ TORCH_INTERNAL_ASSERT(a + b > NUM_FROZEN_PY_STDLIB_MODULES + 1, "Missing frozen python stdlib or torch modules"); + + /* Allocate new memory for the combined table */ + if (a + b + c <= SIZE_MAX / sizeof(struct _frozen) - 1) { + size_t size = sizeof(struct _frozen) * (a + b + c + 1); + p = (_frozen*)PyMem_Realloc(p, size); + } + if (p == nullptr) { + return -1; + } + + /* Copy the tables into the new memory */ + memcpy(p, PyImport_FrozenModules, (c + 1) * sizeof(struct _frozen)); + memcpy(p + c, frozenpython, (a + 1) * sizeof(struct _frozen)); + memcpy(p + a + c, frozentorch, (b + 1) * sizeof(struct _frozen)); + PyImport_FrozenModules = p; + return res; +} + +// We need to register a custom finder because we are registering `torch._C` as +// a built-in module, and it will otherwise get skipped by the default importer. +const char* finder = R"RAW( +import sys +# Remove the path-based importer, as we don't want our isolated interpreter to read the file system +sys.meta_path = sys.meta_path[:-1] + +class F: + def find_spec(self, fullname, path, target=None): + if fullname == 'torch._C': + return sys.meta_path[1].find_spec('torch._C', None, None) + return None +sys.meta_path.insert(0, F()) + +# make loader importable +)RAW"; + +const char* sysprint = R"RAW( +import sys +print("exec_prefix:", sys.base_exec_prefix) +print("_base_executable:", sys._base_executable) +print("base_prefix:", sys.base_prefix) +print("exec_prefix:", sys.exec_prefix) +print("executable:", sys.executable) +print("path:", sys.path) +print("prefix:", sys.prefix) + +)RAW"; + +extern "C" PyObject* initModule(void); +extern "C" struct _frozen _PyImport_FrozenModules[]; +extern "C" struct _frozen _PyImport_FrozenModules_torch[]; + +static std::atomic s_id; +std::map forwards; + +__attribute__((constructor)) void init() { + +} + +void startup() { +#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name); + FOREACH_LIBRARY(APPEND_INIT) +#undef APPEND_INIT + PyImport_AppendInittab("torch._C", initModule); + + int ret = extendFrozenModules(_PyImport_FrozenModules, _PyImport_FrozenModules_torch); + TORCH_INTERNAL_ASSERT(ret == 0); + + PyPreConfig preconfig; + PyPreConfig_InitIsolatedConfig(&preconfig); + PyStatus status = Py_PreInitialize(&preconfig); + TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status)) + + PyConfig config; + PyConfig_InitIsolatedConfig(&config); + + // Completely blank out the path configuration. This ensures we have complete + // control of how our embedded Python searches for modules, and we will never + // consult the external filesystem. 
See: + // https://docs.python.org/3/c-api/init_config.html#path-configuration + config.site_import = 0; + + status = PyConfig_SetString(&config, &config.base_exec_prefix, L""); + status = PyConfig_SetString(&config, &config.base_executable, L"torch_deploy"); + status = PyConfig_SetString(&config, &config.base_prefix, L""); + status = PyConfig_SetString(&config, &config.exec_prefix, L""); + status = PyConfig_SetString(&config, &config.executable, L"torch_deploy"); + status = PyConfig_SetString(&config, &config.prefix, L""); + + + config.module_search_paths_set = 1; + std::array module_search_paths = {}; + status = PyConfig_SetWideStringList( + &config, &config.module_search_paths, 0, module_search_paths.data()); + + status = Py_InitializeFromConfig(&config); + PyConfig_Clear(&config); + TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status)) + + // Uncomment to debug python config + // PyRun_SimpleString(sysprint); + + PyRun_SimpleString(finder); + // Release the GIL that PyInitialize acquires + PyEval_SaveThread(); +} + +void teardown() { + PyGILState_Ensure(); + + if (Py_FinalizeEx() < 0) { + std::cout << "IT BROKE SO WE ARE EXITING\n"; + exit(120); + } + PyMem_RawFree(program); +} + +__attribute__((destructor)) void deinit() {} + +void run_some_python(const char* code) { + PyGILState_STATE gstate = PyGILState_Ensure(); + + if (PyRun_SimpleString(code) == -1) { + throw std::runtime_error("python eval failed\n"); + } + PyGILState_Release(gstate); +} + +void run_python_file(const char* code) { + PyGILState_STATE gstate = PyGILState_Ensure(); + + FILE* f = fopen(code, "r"); + if (PyRun_SimpleFile(f, code) == -1) { + throw std::runtime_error("python eval failed\n"); + } + fclose(f); + + PyGILState_Release(gstate); +} + + +size_t load_model(const char* filename, bool hermetic) { + PyGILState_STATE gstate = PyGILState_Ensure(); + TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1); + std::string code; + + if (hermetic) { + code = fmt::format(R"( +from torch.package import PackageImporter + +i = PackageImporter('{}') +model = i.load_pickle('model', 'model.pkl') +)", filename); + } else { + code = std::string("model = torch.jit.load('") + + std::string(filename) + std::string("')"); + } + py::exec(code); + + auto id = ++s_id; + + PyGILState_Release(gstate); + return id; +} + +at::Tensor forward_model(size_t model_id, at::Tensor const & input) { + at::Tensor output; + PyGILState_STATE gstate = PyGILState_Ensure(); + { + TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1); + auto forward = py::globals()["model"].attr("forward"); + + py::object py_output = forward(input); + // TODO is this going to leak? 
+ // added it to prevent crash wehn using 'output' tensor in callee of + // forward() + py_output.inc_ref(); + output = py::cast(py_output); + } + + PyGILState_Release(gstate); + + return output; + // return input; +} diff --git a/torch/csrc/deploy/interpreter/interpreter.h b/torch/csrc/deploy/interpreter/interpreter.h new file mode 100644 index 000000000000..29e435e44970 --- /dev/null +++ b/torch/csrc/deploy/interpreter/interpreter.h @@ -0,0 +1,67 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +class Interpreter : public InterpreterImpl { + private: + std::string library_name_; + void* handle_; + + public: + Interpreter() : handle_(nullptr) { + char library_name[L_tmpnam]; + library_name_ = library_name; + char* libinterpreter_path = std::getenv("LIBINTERPRETER_PATH"); + if (libinterpreter_path == nullptr) { + throw std::runtime_error("libinterpreter_path is NULL, set LIBINTERPRETER_PATH env."); + } + std::tmpnam(library_name); + { + std::ifstream src(libinterpreter_path, std::ios::binary); + std::ofstream dst(library_name, std::ios::binary); + dst << src.rdbuf(); + } + handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY); + if (!handle_) { + throw std::runtime_error(dlerror()); + } + + // technically, we can unlike the library right after dlopen, and this is + // better for cleanup because even if we crash the library doesn't stick + // around. However, its crap for debugging because gdb can't find the + // symbols if the library is no longer present. + unlink(library_name_.c_str()); + + void* initialize_interface = dlsym(handle_, "initialize_interface"); + if (!initialize_interface) { + throw std::runtime_error("Unable to load initialize_interface function from interpreter lib."); + } + ((void (*)(InterpreterImpl*))initialize_interface)(this); + + this->startup(); + + // the actual torch loading process is not thread safe, by doing it + // in the constructor before we have multiple worker threads, then we + // ensure it doesn't race. + run_some_python("import torch"); + } + ~Interpreter() { + if (handle_) { + this->teardown(); + + // it segfaults its face off trying to unload, but it's not clear + // if this is something we caused of if libtorch_python would also do the + // same if it were opened/closed a lot... + dlclose(handle_); + } + } + Interpreter(const Interpreter&) = delete; +}; diff --git a/torch/csrc/deploy/interpreter/interpreter_impl.h b/torch/csrc/deploy/interpreter/interpreter_impl.h new file mode 100644 index 000000000000..82326bd370f1 --- /dev/null +++ b/torch/csrc/deploy/interpreter/interpreter_impl.h @@ -0,0 +1,26 @@ +#pragma once +#include + +// NOTE- if adding new interface functions, +// update interpreter.cpp initialize_interface. 
+size_t load_model(const char* model_file, bool hermetic=false); +at::Tensor forward_model(size_t model_id, at::Tensor const & input); +void run_some_python(const char* code); +void startup(); +void teardown(); +void run_python_file(const char* code); + + +#define FOREACH_INTERFACE_FUNCTION(_) \ + _(load_model) \ + _(forward_model) \ + _(run_some_python) \ + _(startup) \ + _(teardown) \ + _(run_python_file) + +struct InterpreterImpl { +#define DEFINE_POINTER(func) decltype(&::func) func; + FOREACH_INTERFACE_FUNCTION(DEFINE_POINTER) +#undef DEFINE_POINTER +}; diff --git a/torch/csrc/deploy/interpreter/test_main.cpp b/torch/csrc/deploy/interpreter/test_main.cpp new file mode 100644 index 000000000000..6107267c9f29 --- /dev/null +++ b/torch/csrc/deploy/interpreter/test_main.cpp @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) { + ::testing::InitGoogleTest(&argc, argv); + + int rc = RUN_ALL_TESTS(); + + return rc; +} + +TEST(Interpreter, Sanity) { + ASSERT_TRUE(true); +} + +TEST(Interpreter, Hello) { + Interpreter interp; + interp.run_some_python("print('hello from first interpeter!')"); + + Interpreter interp2; + interp2.run_some_python("print('hello from second interpeter!')"); +} + +void compare_torchpy_jit(const char* model_filename, at::Tensor const & input) { + Interpreter interp; + // Test + auto model_id = interp.load_model(model_filename, false); + at::Tensor output = interp.forward_model(model_id, input); + + // Reference + auto ref_model = torch::jit::load(model_filename); + std::vector ref_inputs; + ref_inputs.emplace_back(torch::jit::IValue(input)); + at::Tensor ref_output = ref_model.forward(ref_inputs).toTensor(); + + ASSERT_TRUE(ref_output.equal(output)); +} + +TEST(Interpreter, SimpleModel) { + char* model_path = std::getenv("SIMPLE_MODEL_PATH"); + ASSERT_NE(model_path, nullptr); + const int A = 10, B = 20; + compare_torchpy_jit( + model_path, torch::ones(at::IntArrayRef({A, B}))); +} diff --git a/torch/csrc/deploy/interpreter/third_party/README.md b/torch/csrc/deploy/interpreter/third_party/README.md new file mode 100644 index 000000000000..2c5d9241d2bb --- /dev/null +++ b/torch/csrc/deploy/interpreter/third_party/README.md @@ -0,0 +1,2 @@ +Python libraries that we want to package along with the Python implementation +bundled in libinterpreter. diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 7286387644ad..faddb8cb16e2 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -113,6 +113,10 @@ def _lazy_call(callable): if is_initialized(): callable() else: + # TODO(torch_deploy): this accesses linecache, which attempts to read the + # file system to get traceback info. Patch linecache or do something + # else here if this ends up being important. 
+ # Don't store the actual traceback to avoid memory cycle _queued_calls.append((callable, traceback.format_stack())) diff --git a/torch/utils/__init__.py b/torch/utils/__init__.py index df6a3793e90d..73eb7f93cf1c 100644 --- a/torch/utils/__init__.py +++ b/torch/utils/__init__.py @@ -2,6 +2,7 @@ from .throughput_benchmark import ThroughputBenchmark import os.path as _osp +import sys # Set the module for a given object for nicer printing def set_module(obj, mod): @@ -9,5 +10,8 @@ def set_module(obj, mod): raise TypeError("The mod argument should be a string") obj.__module__ = mod -#: Path to folder containing CMake definitions for Torch package -cmake_prefix_path = _osp.join(_osp.dirname(_osp.dirname(__file__)), 'share', 'cmake') +if sys.executable == "torch_deploy": + # not valid inside torch_deploy interpreter, no paths exists for frozen modules + cmake_prefix_path = None +else: + cmake_prefix_path = _osp.join(_osp.dirname(_osp.dirname(__file__)), 'share', 'cmake') From 1c9347c6666d0bb8b9793b504e9cb597b75f1401 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 26/41] [ONNX] Use parameter values in onnx shape inference (#49706) (#50905) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50905 Adds an additional run of onnx shape inference after constant folding, since initializer may have changed and affected shape inference. Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050881 Pulled By: SplitInfinity fbshipit-source-id: 9e5d69c52b647133cd3a0781988e2ad1d1a9c09d --- ...erators.test_upsample_nearest_scale.expect | 8 +- ..._nearest_scale_default_scale_factor.expect | 8 +- test/onnx/test_pytorch_onnx_onnxruntime.py | 3 +- torch/_C/__init__.pyi.in | 4 +- .../jit/passes/onnx/shape_type_inference.cpp | 95 ++++++++++++------- .../jit/passes/onnx/shape_type_inference.h | 7 +- torch/csrc/jit/python/init.cpp | 12 ++- torch/onnx/utils.py | 26 +++-- 8 files changed, 102 insertions(+), 61 deletions(-) diff --git a/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect b/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect index 5355daf4f3ca..67d765831c1b 100644 --- a/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect +++ b/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect @@ -50,16 +50,16 @@ graph { elem_type: 1 shape { dim { - dim_param: "Upsample4_dim_0" + dim_value: 1 } dim { - dim_param: "Upsample4_dim_1" + dim_value: 2 } dim { - dim_param: "Upsample4_dim_2" + dim_value: 6 } dim { - dim_param: "Upsample4_dim_3" + dim_value: 8 } } } diff --git a/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect b/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect index 5355daf4f3ca..67d765831c1b 100644 --- a/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect +++ b/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect @@ -50,16 +50,16 @@ graph { elem_type: 1 shape { dim { - dim_param: "Upsample4_dim_0" + dim_value: 1 } dim { - dim_param: "Upsample4_dim_1" + dim_value: 2 } dim { - dim_param: "Upsample4_dim_2" + dim_value: 6 } dim { - dim_param: "Upsample4_dim_3" + dim_value: 8 } } } diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index d3d4ebef5f61..6cabcd728085 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -689,7 
+689,7 @@ def forward(self, input): # Without empty optional arguments dictionary x = torch.randn(2, 3) - self.run_test(NoOptionalModel(), (x,), input_names=['input_x']) + self.run_test(NoOptionalModel(), (x,), input_names=['input_x']) # With empty optional arguments dictionary y = torch.randn(2, 3) self.run_test(NoOptionalModel(), (y, {})) @@ -3668,6 +3668,7 @@ def forward(self, input): self.run_test(SplitModel2(), x) @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() def test_chunk(self): class ChunkModel(torch.nn.Module): def __init__(self): diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index a7ebf35cc0b6..6ebae6b80c1f 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -261,7 +261,7 @@ def _replace_overloaded_method_decl(overload_decl: Decl, implementation_def: Def def _jit_pass_lower_all_tuples(graph: Graph) -> None: ... def _jit_pass_onnx_set_dynamic_input_shape(graph: Graph, dynamic_axes: Dict[str, Dict[_int, str]], input_names: List[str]) -> None: ... -def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, opset_version: _int) -> None: ... +def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> None: ... def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], desc: IODescriptor, onnx_shape_inference: _bool = False) -> None: ... def _jit_pass_onnx_remove_inplace_ops_for_onnx(graph: Graph) -> None: ... def _jit_pass_remove_inplace_ops(graph: Graph) -> None: ... @@ -298,7 +298,7 @@ def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IV def _jit_pass_onnx_cast_all_constant_to_floating(graph: Graph) -> None: ... def _jit_pass_filter_non_tensor_arguments(params: Dict[str, IValue]) -> Dict[str, Tensor]: ... def _jit_decay_packed_param_input_types(graph: Graph) -> None: ... -def _jit_pass_onnx_node_shape_type_inference(n: Node, opset_version: _int) -> None: ... +def _jit_pass_onnx_node_shape_type_inference(n: Node, paramsDict: Dict[str, IValue], opset_version: _int) -> None: ... def _jit_pass_onnx_block( old_block: Block, new_block: Block, diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index bc1cabf81ddf..8d2ff1e63cfc 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -258,34 +258,50 @@ Value* CloneValueFromListConstruct(Value* v, std::shared_ptr n_graph) { } // Clone the node n for the new graph. -Node* CloneNodeToGraph(Node* n, std::shared_ptr n_graph) { - auto clone_node = n_graph->createClone(n, [&n_graph](Value* v) { - auto v_n = v->node(); - switch (v_n->kind()) { - case ::c10::onnx::Constant: { - // Clone the input if it is constant. - auto constant_n = n_graph->insertNode( - n_graph->createClone(v_n, [](Value* v) { return v; })); - return constant_n->output(); - } - case ::c10::prim::ListConstruct: { - return CloneValueFromListConstruct(v, n_graph); - } - case ::c10::prim::PackPadded: { - auto input = n_graph->addInput(); - input->copyMetadata(v_n->input(0)); - return input; - } - default: { - // If the input is not constant, we cannot depend on its value - // in shape inference. Set it to graph input in the new graph, - // and copy over metadata, such as datatype and shape. 
- auto input = n_graph->addInput(); - input->copyMetadata(v); - return input; - } - } - }); +Node* CloneNodeToGraph( + Node* n, + std::shared_ptr n_graph, + const ParamMap& params_dict) { + auto vals_to_params_map = + buildValueToParamsMap(n->owningGraph()->block(), params_dict); + auto clone_node = + n_graph->createClone(n, [&n_graph, &vals_to_params_map](Value* v) { + auto v_n = v->node(); + switch (v_n->kind()) { + case ::c10::onnx::Constant: { + // Clone the input if it is constant. + auto constant_n = n_graph->insertNode( + n_graph->createClone(v_n, [](Value* v) { return v; })); + return constant_n->output(); + } + case ::c10::prim::ListConstruct: { + return CloneValueFromListConstruct(v, n_graph); + } + case ::c10::prim::PackPadded: { + auto input = n_graph->addInput(); + input->copyMetadata(v_n->input(0)); + return input; + } + default: { + if (vals_to_params_map.find(v) != vals_to_params_map.end()) { + // If the input is a parameter, insert a constant of its value as + // input. + auto val = vals_to_params_map.find(v)->second.second.toTensor(); + return n_graph + ->insertNode(n_graph->create(::c10::onnx::Constant) + ->t_(attr::value, val)) + ->output(); + } else { + // If the input is not constant, we cannot depend on its value + // in shape inference. Set it to graph input in the new graph, + // and copy over metadata, such as datatype and shape. + auto input = n_graph->addInput(); + input->copyMetadata(v); + return input; + } + } + } + }); return clone_node; } @@ -433,19 +449,25 @@ void FetchBlockInputMetadataFromParent(Block* b) { } } -void ONNXShapeTypeInference(Block* b, int opset_version) { +void ONNXShapeTypeInference( + Block* b, + const ParamMap& params_dict, + int opset_version) { FetchBlockInputMetadataFromParent(b); for (auto n : b->nodes()) { for (auto subblock : n->blocks()) { - ONNXShapeTypeInference(subblock, opset_version); + ONNXShapeTypeInference(subblock, params_dict, opset_version); } - ONNXShapeTypeInference(n, opset_version); + ONNXShapeTypeInference(n, params_dict, opset_version); } } } // namespace -void ONNXShapeTypeInference(Node* n, int opset_version) { +void ONNXShapeTypeInference( + Node* n, + const ParamMap& params_dict, + int opset_version) { GRAPH_UPDATE( "Running ONNX shape inference for node: ", n->kind().toDisplayString()); if (!IsSupportedNode(n)) { @@ -454,7 +476,7 @@ void ONNXShapeTypeInference(Node* n, int opset_version) { // Create a Graph containing only the single node n. // This graph is later converted to ONNX to run shape inference. auto n_graph = std::make_shared(); - auto clone_node = CloneNodeToGraph(n, n_graph); + auto clone_node = CloneNodeToGraph(n, n_graph, params_dict); n_graph->insertNode(clone_node); // Register all node outputs as graph outputs. 
@@ -690,8 +712,11 @@ void ONNXAssignOutputShape( Py_DECREF(py_obj); } -void ONNXShapeTypeInference(std::shared_ptr& graph, int opset_version) { - ONNXShapeTypeInference(graph->block(), opset_version); +void ONNXShapeTypeInference( + std::shared_ptr& graph, + const ParamMap& params_dict, + int opset_version) { + ONNXShapeTypeInference(graph->block(), params_dict, opset_version); } } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.h b/torch/csrc/jit/passes/onnx/shape_type_inference.h index bac7e2439ca9..69fbff1d175c 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.h +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace torch { @@ -34,7 +35,10 @@ TORCH_API void ONNXAssignOutputShape( // The node must have ONNX namespace, and is valid ONNX node accroding to spec. // On successful ONNX shape inference runs, the function updates output types of // n with inferred shape and type. Otherwise n is unchanged. -TORCH_API void ONNXShapeTypeInference(Node* n, int opset_version); +TORCH_API void ONNXShapeTypeInference( + Node* n, + const ParamMap& params_dict, + int opset_version); // Utilize ONNX Shape Inference for graph. // Internally calls ONNXShapeTypeInference for each node, to achieve more @@ -42,6 +46,7 @@ TORCH_API void ONNXShapeTypeInference(Node* n, int opset_version); // the entire graph. TORCH_API void ONNXShapeTypeInference( std::shared_ptr& g, + const ParamMap& params_dict, int opset_version); } // namespace jit diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 2a91bd497e7b..197af7179361 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -210,13 +210,17 @@ void initJITBindings(PyObject* module) { PrepareInplaceOpsForONNX) .def( "_jit_pass_onnx_node_shape_type_inference", - [](Node* n, int opset_version) { - ONNXShapeTypeInference(n, opset_version); + [](Node* n, + std::map& params_dict, + int opset_version) { + ONNXShapeTypeInference(n, params_dict, opset_version); }) .def( "_jit_pass_onnx_graph_shape_type_inference", - [](std::shared_ptr& graph, int opset_version) { - ONNXShapeTypeInference(graph, opset_version); + [](std::shared_ptr& graph, + std::map& params_dict, + int opset_version) { + ONNXShapeTypeInference(graph, params_dict, opset_version); }) .def("_jit_pass_onnx_set_dynamic_input_shape", ONNXSetDynamicInputShape) .def("_jit_pass_fuse", FuseGraph) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 59d45de1f553..7a483c2f728b 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -29,6 +29,8 @@ def is_in_onnx_export(): global __IN_ONNX_EXPORT return __IN_ONNX_EXPORT +# Skip check due to cannot import IValue from torch._C +_params_dict = {} # type: ignore @contextlib.contextmanager def select_model_mode_for_export(model, mode): @@ -224,7 +226,7 @@ def _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop=Fa torch._C._jit_pass_lint(graph) from torch.onnx.symbolic_helper import _onnx_shape_inference, _export_onnx_opset_version if _onnx_shape_inference: - torch._C._jit_pass_onnx_graph_shape_type_inference(graph, _export_onnx_opset_version) + torch._C._jit_pass_onnx_graph_shape_type_inference(graph, params_dict, _export_onnx_opset_version) return graph @@ -358,7 +360,7 @@ def _trace(func, args, operator_export_type, return_outs=False): torch.jit._get_trace_graph(func, args, strict=False, _force_outplace=False, _return_inputs_states=True) 
warn_on_static_input_change(inputs_states) - trace_graph = _optimize_graph(trace_graph, operator_export_type) + trace_graph = _optimize_graph(trace_graph, operator_export_type, params_dict={}) if return_outs: return trace_graph, torch_out return trace_graph @@ -422,6 +424,11 @@ def _create_jit_graph(model, args, _retain_param_name, use_new_jit_passes): torch._C._jit_pass_onnx_function_substitution(graph) return graph, params, torch_out +def _get_named_param_dict(graph, params): + input_and_param_names = [val.debugName() for val in graph.inputs()] + param_names = input_and_param_names[len(input_and_param_names) - len(params):] + _params_dict = dict(zip(param_names, params)) + return _params_dict def _model_to_graph(model, args, verbose=False, input_names=None, output_names=None, @@ -443,9 +450,7 @@ def _model_to_graph(model, args, verbose=False, _retain_param_name, use_new_jit_passes) - input_and_param_names = [val.debugName() for val in graph.inputs()] - param_names = input_and_param_names[len(input_and_param_names) - len(params):] - params_dict = dict(zip(param_names, params)) + params_dict = _get_named_param_dict(graph, params) graph = _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop=_disable_torch_constant_prop, @@ -479,9 +484,7 @@ def _model_to_graph(model, args, verbose=False, flatten_args, _ = torch._C._jit_flatten(args) assert len(params) + len(flatten_args) == sum(1 for _ in graph.inputs()) - input_and_param_names = [val.debugName() for val in graph.inputs()] - param_names = input_and_param_names[len(input_and_param_names) - len(params):] - params_dict = dict(zip(param_names, params)) + params_dict = _get_named_param_dict(graph, params) if training is None or training == TrainingMode.EVAL: params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict) @@ -491,6 +494,9 @@ def _model_to_graph(model, args, verbose=False, _export_onnx_opset_version) torch._C._jit_pass_dce_allow_deleting_nodes_with_side_effects(graph) + if _onnx_shape_inference: + torch._C._jit_pass_onnx_graph_shape_type_inference(graph, params_dict, _export_onnx_opset_version) + params_dict = torch._C._jit_pass_onnx_eliminate_unused_items(graph, params_dict) # For ONNX opset < 9, constants only have three data types: float16, float, double. @@ -878,7 +884,7 @@ def const_if_tensor(arg): from torch.onnx.symbolic_helper import _onnx_shape_inference if _onnx_shape_inference: from torch.onnx.symbolic_helper import _export_onnx_opset_version as opset_version - torch._C._jit_pass_onnx_node_shape_type_inference(n, opset_version) + torch._C._jit_pass_onnx_node_shape_type_inference(n, _params_dict, opset_version) if outputs == 1: return n.output() @@ -1032,7 +1038,7 @@ def _run_symbolic_function(g, n, inputs, env, operator_export_type=OperatorExpor # Process Loop and If after subblock is converted. from torch.onnx.symbolic_helper import _onnx_shape_inference if _onnx_shape_inference: - torch._C._jit_pass_onnx_node_shape_type_inference(new_node, opset_version) + torch._C._jit_pass_onnx_node_shape_type_inference(new_node, _params_dict, opset_version) return new_op_outputs else: symbolic_name = 'prim_' + op_name From 7e4c95695539c0fe0675ab0c91de761247143d8a Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 27/41] [ONNX] Support opset13 Squeeze and Unsqueeze (#50150) (#50906) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50906 In opset 13, squeeze/unsqueeze is updated to take axes as input, instead of attribute. 
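
For illustration only (not part of the original change): a small sketch of the practical difference, assuming a torch build that includes this support and an installed `onnx` package. The output file names are arbitrary. Exporting the same squeeze at opset 11 and 13 shows `axes` moving from a node attribute to a second node input.

```
import torch
import onnx

class SqueezeModel(torch.nn.Module):
    def forward(self, x):
        return torch.squeeze(x, dim=1)

x = torch.randn(2, 1, 4)
for opset in (11, 13):
    fname = f"squeeze_opset{opset}.onnx"
    torch.onnx.export(SqueezeModel(), x, fname, opset_version=opset)
    node = next(n for n in onnx.load(fname).graph.node if n.op_type == "Squeeze")
    attrs = [a.name for a in node.attribute]
    # Opset 11: axes shows up in `attrs`; opset 13: axes arrives as an extra input.
    print(opset, "attributes:", attrs, "num inputs:", len(node.input))
```
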
Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050883 Pulled By: SplitInfinity fbshipit-source-id: 7b5faf0e016d476bc75cbf2bfee6918d77e8aecd --- test/onnx/test_pytorch_onnx_onnxruntime.py | 119 +-------------- test/onnx/test_utility_funs.py | 4 +- torch/csrc/jit/passes/onnx/helper.cpp | 23 +++ torch/csrc/jit/passes/onnx/helper.h | 9 ++ torch/csrc/jit/passes/onnx/peephole.cpp | 137 +++++++++--------- .../jit/passes/onnx/shape_type_inference.cpp | 28 ++-- torch/onnx/symbolic_helper.py | 24 ++- torch/onnx/symbolic_opset10.py | 14 +- torch/onnx/symbolic_opset11.py | 35 +++-- torch/onnx/symbolic_opset12.py | 8 +- torch/onnx/symbolic_opset9.py | 70 ++++----- 11 files changed, 205 insertions(+), 266 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 6cabcd728085..5b7091b6612f 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -302,7 +302,6 @@ def forward(self, input): x = torch.tensor([2], dtype=torch.long) self.run_model_test_with_external_data(model, x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. def test_mobilenet_v2_with_external_data(self): model = torchvision.models.mobilenet_v2(pretrained=True) @@ -444,7 +443,6 @@ def get_test_images(self): images = [image] return images - @skipIfUnsupportedOpsetVersion([13]) def test_paste_mask_in_image(self): # disable profiling torch._C._jit_set_profiling_executor(False) @@ -525,22 +523,18 @@ def test_keypoint_rcnn(self): dynamic_axes={"images_tensors": [0, 1, 2]}, rtol=1e-3, atol=1e-5) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_word_language_model_RNN_TANH(self): self.run_word_language_model("RNN_TANH") - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_word_language_model_RNN_RELU(self): self.run_word_language_model("RNN_RELU") - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_word_language_model_LSTM(self): self.run_word_language_model("LSTM") - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_word_language_model_GRU(self): self.run_word_language_model("GRU") @@ -768,7 +762,6 @@ def forward(self, x, y=None, z=None): z = torch.randn(2, 3) self.run_test(Model(), (x, None, z)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_cste_script(self): class MyModel(torch.jit.ScriptModule): @@ -1037,44 +1030,37 @@ def forward(self, x): else: self.run_test(Squeeze(d), x1) - @skipIfUnsupportedOpsetVersion([13]) def test_squeeze_without_no_op(self): x = torch.randn(2, 1, 4) self.squeeze_model_tests(1, x, None) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_squeeze_dynamic(self): x_squeeze = torch.randn(2, 1, 4) x_noop = torch.randn(2, 2, 3) self.squeeze_model_tests(1, x_squeeze, x_noop) - @skipIfUnsupportedOpsetVersion([13]) def test_squeeze_neg_without_no_op(self): x = torch.randn(2, 1, 4) self.squeeze_model_tests(-2, x, None) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_squeeze_neg(self): x_squeeze = torch.randn(2, 1, 4) x_noop = torch.randn(2, 2, 3) self.squeeze_model_tests(-2, x_squeeze, x_noop) - @skipIfUnsupportedOpsetVersion([13]) def test_squeeze_all_dims(self): x_squeeze = torch.randn(2, 1, 4) x_noop = torch.randn(2, 2, 3) self.squeeze_model_tests(None, x_squeeze, x_noop) - @skipIfUnsupportedOpsetVersion([13]) 
@skipIfUnsupportedMinOpsetVersion(11) def test_squeeze_no_op(self): x_noop = torch.randn(2, 1, 4) x_squeeze = torch.randn(2, 2, 1) self.squeeze_model_tests(2, x_noop, x_squeeze) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_squeeze_runtime_dim(self): class Squeeze(torch.nn.Module): @@ -1088,7 +1074,14 @@ def forward(self, d1, d2): self.run_test(Squeeze(), (d1, d4), test_with_inputs=[(d3, d4)]) self.run_test(Squeeze(), (d3, d4), test_with_inputs=[(d1, d3)]) - @skipIfUnsupportedOpsetVersion([13]) + def test_squeeze(self): + class Squeeze(torch.nn.Module): + def forward(self, x): + return torch.squeeze(x, dim=-2) + + x = torch.randn(2, 1, 4) + self.run_test(Squeeze(), x) + def test_unsqueeze(self): class Unsqueeze(torch.nn.Module): def forward(self, x): @@ -1288,7 +1281,6 @@ def forward(self, x, y): y = torch.randn(2, 3, 4) self.run_test(FloorDivModule(), (x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_floordiv(self): class FloordivModule(torch.nn.Module): @@ -1366,7 +1358,6 @@ def forward(self, x, y): y = torch.arange(1, 2 * 3 * 4 + 1).reshape(2, 3, 4).to(torch.double) self.run_test(torch.jit.script(DivModule()), (x, y)) - @skipIfUnsupportedOpsetVersion([13]) def test_slice_trace(self): class MyModule(torch.nn.Module): def forward(self, x): @@ -1375,7 +1366,6 @@ def forward(self, x): x = torch.randn(3) self.run_test(MyModule(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_slice_neg(self): class NegSlice(torch.nn.Module): def forward(self, x): @@ -1384,7 +1374,6 @@ def forward(self, x): x = torch.randn(3, 4, 5) self.run_test(NegSlice(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_slice_neg_large(self): class NegSlice(torch.nn.Module): def forward(self, x): @@ -1393,7 +1382,6 @@ def forward(self, x): x = torch.randn(3, 4, 5, 6, 7) self.run_test(NegSlice(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_slice_neg_large_negone(self): class NegSlice(torch.nn.Module): def forward(self, x): @@ -1402,7 +1390,6 @@ def forward(self, x): x = torch.randn(3, 4, 5, 6, 7) self.run_test(NegSlice(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_slice_with_input_index(self): class InputIndexSlice(torch.nn.Module): @@ -1414,7 +1401,6 @@ def forward(self, x, y): y = torch.rand((22, 256)) self.run_test(InputIndexSlice(), (x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(10) @disableScriptTest() # scripting tuple/list append def test_slice_dynamic(self): @@ -1433,7 +1419,6 @@ def forward(self, x): dynamic_axes={'input_1': [0, 1, 2], 'output_1': [0, 1, 2]}) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(10) def test_slice_dynamic_script(self): class DynamicSliceModel(torch.jit.ScriptModule): @@ -1444,7 +1429,6 @@ def forward(self, x): x = torch.rand(1, 2) self.run_test(DynamicSliceModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(10) def test_slice_dynamic_shape_script(self): class DynamicSliceModel(torch.nn.Module): @@ -1454,7 +1438,6 @@ def forward(self, x): x = torch.rand(1, 2, 3, 4) self.run_test(DynamicSliceModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(10) @disableScriptTest() # scripting tuple/list append def test_slice_dynamic_to_end(self): @@ -1561,7 +1544,6 @@ def forward(self, end): x = torch.tensor(6.2, dtype=torch.float) self.run_test(ArangeModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def 
test_size(self): class SizeModel(torch.nn.Module): @@ -1571,7 +1553,6 @@ def forward(self, input): x = torch.randn(5, 3, 2) self.run_test(SizeModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() # x.stride() not scriptable def test_as_strided(self): @@ -1586,7 +1567,6 @@ def forward(self, x): x = torch.randn(5, 8, 7) self.run_test(Model(), x) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() # Ellipses followed by tensor indexing not scriptable def test_tensor_index_advanced_indexing_ellipsis(self): class MyModel(torch.nn.Module): @@ -1596,7 +1576,6 @@ def forward(self, input): m1 = torch.randn(3, 4, 5, 6, 7) self.run_test(MyModel(), (m1,)) - @skipIfUnsupportedOpsetVersion([13]) def test_tensor_index_advanced_indexing(self): class MyModel(torch.nn.Module): def forward(self, input): @@ -1617,7 +1596,6 @@ def forward(self, input): self.run_test(MyModel(), (m1,)) - @skipIfUnsupportedOpsetVersion([13]) def test_tensor_index_advanced_indexing_consecutive(self): class MyModel(torch.nn.Module): def forward(self, input): @@ -1626,7 +1604,6 @@ def forward(self, input): m1 = torch.randn(3, 4, 5, 6, 7) self.run_test(MyModel(), (m1,)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put(self): class IndexPutModel(torch.nn.Module): @@ -1639,7 +1616,6 @@ def forward(self, x, ind, update): update = torch.ones(4) self.run_test(IndexPutModel(), (x, ind, update)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_accumulate(self): class IndexPutModel(torch.nn.Module): @@ -1651,7 +1627,6 @@ def forward(self, x, ind, update): update = torch.ones(4) self.run_test(IndexPutModel(), (x, ind, update)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_slice_index(self): class IndexPutModel(torch.nn.Module): @@ -1726,7 +1701,6 @@ def forward(self, x, update): update = torch.arange(3 * 5).to(torch.float).view(3, 5) self.run_test(IndexPutModel8(), (x, update)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() # Ellipses followed by tensor indexing not scriptable def test_index_put_ellipsis(self): @@ -1748,7 +1722,6 @@ def forward(self, x, update): update = torch.randn(4, 1, 3, 2) self.run_test(IndexPutModel2(), (x, update)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_loop(self): @torch.jit.script @@ -1831,7 +1804,6 @@ def forward(self, x, ind, data): data = torch.randn(4) self.run_test(CopyModel4(), (x, ind, data)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() # Model not scriptable (output with shape doesn't match the broadcast shape) def test_copy_tracing(self): @@ -1844,7 +1816,6 @@ def forward(self, x, data): update = torch.randn(1, 2) self.run_test(CopyModel(), (x, update)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_copy_ellipsis(self): class CopyModel(torch.nn.Module): @@ -1860,7 +1831,6 @@ def forward(self, x, update): update = torch.ones(1) self.run_test(CopyModel(), (x, update)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) # TODO: Limited scripting support with ellipsis indexing. # Due to dependency on input tensor rank being known. 
@@ -1899,7 +1869,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Rand(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_random_dynamic_size(self): class RandN(torch.nn.Module): @@ -2062,12 +2031,10 @@ def _interpolate_tests(self, is_upsample): self._interpolate_script(xi, mode_i, False, is_upsample, True) self._interpolate_script(xi, mode_i, False, is_upsample) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_interpolate_upsample(self): self._interpolate_tests(True) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() @skipIfUnsupportedMinOpsetVersion(9) def test_interpolate_function_substitution(self): @@ -2098,13 +2065,11 @@ def forward(self, x): self.run_test(TracingModule(), (x,)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(10) @disableScriptTest() def test_interpolate_downsample(self): self._interpolate_tests(False) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() def test_interpolate_no_shape(self): @@ -2120,7 +2085,6 @@ def forward(self, x, y): y = torch.randn(16, 16, requires_grad=True) self.run_test(MyModel(), (x, y)) - @skipIfUnsupportedOpsetVersion([13]) def test_interpolate_adaptive_pooling_error(self): x = torch.randn(1, 2, 6, requires_grad=True) with self.assertRaises(RuntimeError) as cm: @@ -2129,7 +2093,6 @@ def test_interpolate_adaptive_pooling_error(self): with self.assertRaises(RuntimeError) as cm: self._interpolate(x, "area", False, True) - @skipIfUnsupportedOpsetVersion([13]) def test_groupnorm(self): model = torch.nn.GroupNorm(3, 6, 0.002) x = torch.randn(4, 6, 180, 180, 180) @@ -2143,7 +2106,6 @@ def test_groupnorm(self): x = torch.randn(4, 6, 180, 180) self.run_test(model, x) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_groupnorm_noaffine(self): model = torch.nn.GroupNorm(4, 8, 0.002, affine=False) @@ -2158,7 +2120,6 @@ def test_groupnorm_noaffine(self): x = torch.randn(4, 6, 180, 180) self.run_test(model, x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_listunpack(self): class ListUnpack(torch.jit.ScriptModule): @@ -2462,7 +2423,6 @@ def forward(self, input, input2): input2 = torch.arange(24, dtype=torch.uint8).reshape(3, 4, 2) self.run_test(BitshiftModel(), (input, input2)) - @skipIfUnsupportedOpsetVersion([13]) def test_narrow(self): class NarrowModel(torch.nn.Module): def forward(self, input): @@ -2471,7 +2431,6 @@ def forward(self, input): x = torch.randn(3, 3, requires_grad=True) self.run_test(NarrowModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_narrow_dynamic(self): class NarrowModel(torch.nn.Module): @@ -2481,7 +2440,6 @@ def forward(self, input): x = torch.randn(3, 3, requires_grad=True) self.run_test(NarrowModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_index_fill(self): class IndexFillModel(torch.nn.Module): @@ -2492,7 +2450,6 @@ def forward(self, input): x = torch.randn(3, 4, 5, requires_grad=True) self.run_test(IndexFillModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_index_copy(self): class IndexCopyModel(torch.nn.Module): @@ -2734,7 +2691,6 @@ def forward(self, input, indices): indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) self.run_test(GatherModel(), input=(input, indices)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def 
test_expand(self): class ExpandModel(torch.nn.Module): @@ -2894,7 +2850,6 @@ def forward(self, x): x = torch.randn(3, 4, 5, requires_grad=True) self.run_test(Model(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() # scripting prim_dtype def test_lstm_no_hidden(self): @@ -2909,7 +2864,6 @@ def forward(self, x): input = torch.randn((10, 16, 16)) self.run_test(LSTMModel(), (input,)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() # scripting prim_dtype def test_lstm_proj_no_hidden(self): @@ -2926,7 +2880,6 @@ def forward(self, x): self.run_test(LSTMModel(), (input,)) @skipIfUnsupportedMinOpsetVersion(9) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_lstm(self): model = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) @@ -2935,7 +2888,6 @@ def test_lstm(self): c0 = torch.randn(1, BATCH_SIZE, RNN_HIDDEN_SIZE) self.run_test(model, (input, (h0, c0))) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() def test_lstm_default_init_state(self): @@ -2943,7 +2895,6 @@ def test_lstm_default_init_state(self): input = torch.randn(RNN_SEQUENCE_LENGTH, BATCH_SIZE, RNN_INPUT_SIZE) self.run_test(model, input) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() # LSTMModel model not scriptable def test_lstm_fixed_batch_size(self): @@ -2965,7 +2916,6 @@ def forward(self, input): input2 = torch.randn(RNN_SEQUENCE_LENGTH, BATCH_SIZE, RNN_INPUT_SIZE) self.run_test(LSTMModel(), input, fixed_batch_size=True, test_with_inputs=[input2]) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() def test_lstm_post_fix_init_state(self): @@ -2990,7 +2940,6 @@ def forward(self, input): self.run_test(model, input, dynamic_axes={'input' : {0 : 'seq', 1 : 'batch'}}, test_with_inputs=[input2]) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_lstm_constant_folding(self): class LstmNet(torch.nn.Module): @@ -3018,7 +2967,6 @@ def get_LstmNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size model2, input2 = get_LstmNet_model_and_inputs(5, 4, 3, batch_size2, 7, False) self.run_test(model2, input2, do_constant_folding=True) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() def test_lstm_no_bias(self): @@ -3044,7 +2992,6 @@ def get_LstmNet_model_and_inputs(num_layers, bidirectional): for model, input in models_and_inputs: self.run_test(model, input) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_rnn_no_bias(self): def make_model(layers, packed_sequence): @@ -3084,7 +3031,6 @@ def make_input(batch_size, layers, packed_sequence): for model, input in zip(models, inputs): self.run_test(model, input, batch_size=RNN_BATCH_SIZE) - @skipIfUnsupportedOpsetVersion([13]) def test_gru_no_bias(self): class GruNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): @@ -3114,7 +3060,6 @@ def get_GruNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, for model, input in models_and_inputs: self.run_test(model, input, do_constant_folding=True) - @skipIfUnsupportedOpsetVersion([13]) def test_gru_constant_folding(self): class GruNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): @@ -3377,7 +3322,6 @@ def test_argmin_argmax_select_last_index(self): input = torch.ones(7, 3, 5) 
self._argmin_argmax_model(input) - @skipIfUnsupportedOpsetVersion([13]) def test_repeat(self): class RepeatModel(torch.nn.Module): def forward(self, x, y): @@ -3397,7 +3341,6 @@ def forward(self, input): x = torch.randint(10, (4, 2, 3, 4), dtype=torch.int32) self.run_test(ViewModel(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_view_dynamic(self): class ViewModel(torch.nn.Module): def forward(self, input, other): @@ -3407,7 +3350,6 @@ def forward(self, input, other): shape = torch.randn(6, 4) self.run_test(ViewModel(), (x, shape)) - @skipIfUnsupportedOpsetVersion([13]) def test_view_dynamic_zero_dim(self): class ViewModel(torch.nn.Module): def forward(self, input): @@ -3567,7 +3509,6 @@ def forward(self, input): self.run_test(LenModel(), x, input_names=['input'], dynamic_axes={'input': {0: 'seq'}}, test_with_inputs=(torch.randn(5, 5),)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_len_list(self): class LenListModel(torch.jit.ScriptModule): @@ -3618,7 +3559,6 @@ def forward(self, input): x = torch.randn(5, 4, 3) self.run_test(SplitModel3(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() def test_split_size_as_list(self): @@ -3635,7 +3575,6 @@ def forward(self, input, split_sizes: List[int]): split_sizes = [torch.tensor(2), torch.tensor(4)] self.run_test(SplitModel(), (x, split_sizes)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_split_size_with_slice(self): class SplitModule(torch.nn.Module): @@ -3707,7 +3646,6 @@ def forward(self, x): x = torch.randn(4, 5, 6) self.run_test(ConcatDynamicModel(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_stack(self): class StackModel(torch.nn.Module): def forward(self, x, y, z): @@ -3718,7 +3656,6 @@ def forward(self, x, y, z): z = torch.randn(3, 4, 5) self.run_test(StackModel(), (x, y, z)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_stack_dynamic(self): class StackDynamicModel(torch.jit.ScriptModule): @@ -3795,7 +3732,6 @@ def forward(self, x): x = torch.randn(5, 3, 3) self.run_test(model, x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_loop_multi_dim(self): class LoopMultiDimModel(torch.jit.ScriptModule): @@ -3810,7 +3746,6 @@ def forward(self, x, y): y = torch.ones(1, dtype=torch.long) self.run_test(model, (x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_list(self): class ListModel(torch.jit.ScriptModule): @@ -3832,7 +3767,6 @@ def forward(self, x): inputs = torch.randn(16, 1) self.run_test(model, inputs) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_tensor_factories(self): class TensorFactory(torch.nn.Module): @@ -3842,7 +3776,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(TensorFactory(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_tensor_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @@ -3853,7 +3786,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(TensorFactory(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_tensor_like_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @@ -3866,7 +3798,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(TensorFactory(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_eye(self): class 
TensorFactory(torch.nn.Module): @@ -3889,7 +3820,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Zero_(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_new_zeros(self): class Zero_(torch.nn.Module): @@ -3913,7 +3843,6 @@ def forward(self, input): x = torch.randn(2, 3) self.run_test(List(), (x,)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() def test_list_pass(self): @@ -3953,7 +3882,6 @@ def forward(self, x, y): y = torch.randn(1, 2, 3) self.run_test(List(), (x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_new_empty(self): class Emtpy(torch.nn.Module): @@ -3963,7 +3891,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Emtpy(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_new_full(self): class Full(torch.nn.Module): @@ -3973,7 +3900,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Full(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_inplace_list(self): class Arithmetic(torch.jit.ScriptModule): @@ -4070,7 +3996,6 @@ def forward(self, x): x = torch.arange(16).view(2, 2, 4).to(torch.float32) self.run_test(MaskedFillModel2(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_masked_scatter(self): class MaskedScatterModel(torch.nn.Module): @@ -4089,7 +4014,6 @@ def forward(self, x): x = torch.randn(3, 4, 5, requires_grad=True) self.run_test(MaskedSelectModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() # dtype not available def test_index_put_to_masked_fill(self): @@ -4104,7 +4028,6 @@ def forward(self, input_mask, some_const): constant = torch.tensor(5, dtype=torch.float) self.run_test(MaskedFillModel(), (mask, constant)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() # dtype not available def test_index_put_to_masked_scatter(self): @@ -4225,7 +4148,6 @@ def forward(self, x): x = torch.randn(4, 2, 3, requires_grad=True) self.run_test(NormModel(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_unfold(self): class UnfoldModel(torch.nn.Module): def forward(self, x): @@ -4238,7 +4160,6 @@ def forward(self, x): input_names=['x'], test_with_inputs=[y]) - @skipIfUnsupportedOpsetVersion([13]) @skipIfONNXShapeInference(False) def test_unfold_infer_shape(self): class UnfoldModule(torch.jit.ScriptModule): @@ -4254,7 +4175,6 @@ def forward(self, x): x = torch.randn(32, 3, 64) self.run_test(UnfoldModule(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_unfold_dynamic_inputs(self): class UnfoldModel(torch.nn.Module): @@ -4264,7 +4184,6 @@ def forward(self, x): x = torch.randn(4, 2, 4, requires_grad=True) self.run_test(UnfoldModel(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_prelu(self): class PReluModel(torch.nn.Module): def __init__(self): @@ -4594,7 +4513,6 @@ def forward(self, input, other): model = MyModule() self.run_test(model, (x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_ones_bool(self): class MyModule(torch.nn.Module): @@ -4641,7 +4559,6 @@ def test_constant_pad(self): self.run_test(model, x) # Dynamic padding is added in opset 11 - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() # Functional module not scriptable def 
test_pad_types(self): @@ -4846,7 +4763,6 @@ def test_replication_pad(self): x = torch.randn(2, 2, 4, 4) self.run_test(model, x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_im2col(self): class Unfold(torch.nn.Module): @@ -4870,7 +4786,6 @@ def forward(self, x): # This test checks output scalar type in the ONNX graph should not be null # https://github.com/pytorch/pytorch/issues/28607 - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(10) def test_trace_script(self): @torch.jit.script @@ -4944,7 +4859,6 @@ def forward(self, *tensor_list): x = torch.randn(3, 4) self.run_test(EinsumModelTranspose(), input=(x,)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_crossentropyloss(self): for ignore_index in [-100, 1]: @@ -5043,7 +4957,6 @@ def forward(self, input, target): self.run_test(CrossEntropyLossMeanWeight(ignore_index), input=(x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_kldiv_loss(self): @@ -5110,7 +5023,6 @@ def forward(self, input, target): self.run_test(KLDivLossMiniBatchMean(), input=(x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss(self): class NLLModel(torch.nn.Module): @@ -5131,7 +5043,6 @@ def forward(self, input, target): target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss_2d_none(self): class NLLModel(torch.nn.Module): @@ -5153,7 +5064,6 @@ def forward(self, input, target): target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss_2d_mean(self): class NLLModel(torch.nn.Module): @@ -5175,7 +5085,6 @@ def forward(self, input, target): target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss_2d_sum(self): class NLLModel(torch.nn.Module): @@ -5197,7 +5106,6 @@ def forward(self, input, target): target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss_2d_mean_weights(self): class NLLModel(torch.nn.Module): @@ -5219,7 +5127,6 @@ def forward(self, input, target): target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss_2d_mean_ignore_index(self): class NLLModel(torch.nn.Module): @@ -5238,7 +5145,6 @@ def forward(self, input, target): target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) self.run_test(NLLModel(), (input, target)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss_2d_mean_ignore_index_weights(self): class NLLModel(torch.nn.Module): @@ -5267,7 +5173,6 @@ def forward(self, mat1, mat2): mat2 = torch.randn(3, 3) self.run_test(M(), input=(mat1, mat2)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) # Because where op is not supported for opset < 9. def test_where_with_bool_tensor(self): class M(torch.nn.Module): @@ -5279,7 +5184,6 @@ def forward(self, mat1, mat2): mat2 = torch.ones(2, 3) self.run_test(M(), input=(mat1, mat2)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) # Because where op is not supported for opset < 9. 
def test_where_with_byte_tensor(self): class M(torch.nn.Module): @@ -5478,7 +5382,6 @@ def forward(self, x): @skipIfONNXShapeInference(False) @skipIfUnsupportedMinOpsetVersion(13) - @skipIfUnsupportedOpsetVersion([13]) def test_if_list(self): class IfModel(torch.nn.Module): def forward(self, x, y, cond): @@ -5669,7 +5572,6 @@ def forward(self, input): x = torch.randn(6, 4, 3, 3) self.run_test(FakeQuantizePerTensorModel(), (x)) - @skipIfUnsupportedOpsetVersion([13]) def test_batchnorm_training(self): class MyModule(torch.nn.Module): def __init__(self): @@ -5793,7 +5695,6 @@ def forward(self, x): np.testing.assert_allclose(ratio_pytorch, ratio_ort, rtol=0.01, atol=0.01) - @skipIfUnsupportedOpsetVersion([13]) def test_conv_bn(self): class MyModule(torch.nn.Module): def __init__(self): @@ -5992,7 +5893,6 @@ def forward(self, boxes, scores): self.run_test(Module(), (boxes, scores)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_clip_boxes_to_image(self): boxes = torch.randn(5, 4) * 500 @@ -6050,7 +5950,6 @@ def test_roi_pool(self): model = ops.RoIPool((pool_h, pool_w), 2) self.run_test(model, (x, rois)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_resize_images(self): class TransformModule(torch.nn.Module): @@ -6067,7 +5966,6 @@ def forward(self, images): input_names=["input1"], dynamic_axes={"input1": [0, 1, 2]}, test_with_inputs=[(input_test,)]) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_transform_images(self): @@ -6198,7 +6096,6 @@ def make_test(name, base, layer, bidirectional, initial_state, # Cannot export with older opsets because of 'ConstantFill' op # ConstantFill was a temp op removed at opset 8. This is no longer supported by onnxruntime - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() # Test code not scriptable @skipIfUnsupportedMinOpsetVersion(9) def f(self): diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index 5c1bfe8b5515..3aadabf85769 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -5,7 +5,7 @@ from torch.onnx import utils, OperatorExportTypes, TrainingMode from torch.onnx.symbolic_helper import _set_opset_version, _set_operator_export_type import torch.utils.cpp_extension -from test_pytorch_common import skipIfUnsupportedMinOpsetVersion +from test_pytorch_common import skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedOpsetVersion import caffe2.python.onnx.backend as backend from verify import verify @@ -618,6 +618,8 @@ def forward(self, x): assert next(iter).kind() == "aten::quantize_per_tensor" assert next(iter).kind() == "aten::dequantize" + # prim::ListConstruct is exported as onnx::SequenceConstruct for opset >= 11 + @skipIfUnsupportedOpsetVersion([11, 12]) def test_prim_fallthrough(self): # Test prim op class PrimModule(torch.jit.ScriptModule): diff --git a/torch/csrc/jit/passes/onnx/helper.cpp b/torch/csrc/jit/passes/onnx/helper.cpp index a14dcd611dd8..aca08331183c 100644 --- a/torch/csrc/jit/passes/onnx/helper.cpp +++ b/torch/csrc/jit/passes/onnx/helper.cpp @@ -97,5 +97,28 @@ Value* addInputToBlock(Block* block) { return block->addInput(); } +Node* createONNXUnsqueeze( + Graph* graph, + Node* n_to_insert_before, + Value* input, + int axis, + int opset_version) { + Node* unsqueeze_node = graph->create(onnx::Unsqueeze, 1); + unsqueeze_node->addInput(input); + unsqueeze_node->insertBefore(n_to_insert_before); + if (opset_version >= OPSET_VERSION_13) { + // ONNX spec sets 
`axes` as input for opset >= 13. + Node* unsqueeze_axes = graph->create(onnx::Constant, 1); + unsqueeze_axes->insertBefore(unsqueeze_node); + unsqueeze_axes->t_( + attr::value, at::unsqueeze(at::scalar_to_tensor(at::Scalar(axis)), 0)); + unsqueeze_node->addInput(unsqueeze_axes->output()); + } else { + // ONNX spec sets `axes` as attribute for opset < 13. + unsqueeze_node->is_(attr::axes, {0}); + } + return unsqueeze_node; +} + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/onnx/helper.h b/torch/csrc/jit/passes/onnx/helper.h index e27909ff6362..43989bd8e6c3 100644 --- a/torch/csrc/jit/passes/onnx/helper.h +++ b/torch/csrc/jit/passes/onnx/helper.h @@ -13,6 +13,7 @@ static const int OPSET_VERSION_9 = 9; static const int OPSET_VERSION_10 = 10; static const int OPSET_VERSION_11 = 11; static const int OPSET_VERSION_12 = 12; +static const int OPSET_VERSION_13 = 13; using ValueToParamPairMap = std::map>; @@ -33,5 +34,13 @@ Node* addNodeToBlock(Block* block, Symbol kind, ArrayRef inputs); Value* addInputToBlock(Block* block); TORCH_API c10::optional ONNXTypeToATenType(int32_t onnx_type); + +Node* createONNXUnsqueeze( + Graph* graph, + Node* n_to_insert_before, + Value* input, + int axis, + int opset_version); + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/onnx/peephole.cpp b/torch/csrc/jit/passes/onnx/peephole.cpp index d488201a8f80..8aa07332cc65 100644 --- a/torch/csrc/jit/passes/onnx/peephole.cpp +++ b/torch/csrc/jit/passes/onnx/peephole.cpp @@ -416,10 +416,8 @@ void fixDefaultRNNState( batch_size->addInput(shape_of_input->outputs()[0]); batch_size->addInput(gather_indices->outputs()[0]); - Node* unsqueezed_batch_size = graph->create(onnx::Unsqueeze, 1); - unsqueezed_batch_size->insertBefore(n); - unsqueezed_batch_size->addInput(batch_size->outputs()[0]); - unsqueezed_batch_size->is_(attr::axes, {0}); + Node* unsqueezed_batch_size = + createONNXUnsqueeze(graph, n, batch_size->outputs()[0], 0, opset_version); Node* hidden_size = graph->create(onnx::Constant, 1); hidden_size->insertBefore(n); @@ -440,10 +438,8 @@ void fixDefaultRNNState( ? 
2 : 1))); - Node* unsqueezed_num_directions = graph->create(onnx::Unsqueeze, 1); - unsqueezed_num_directions->insertBefore(n); - unsqueezed_num_directions->addInput(num_directions->outputs()[0]); - unsqueezed_num_directions->is_(attr::axes, {0}); + Node* unsqueezed_num_directions = createONNXUnsqueeze( + graph, n, num_directions->outputs()[0], 0, opset_version); Node* concated_dims = graph->create(onnx::Concat, 1); concated_dims->insertBefore(n); @@ -555,6 +551,65 @@ static void replaceInputWithList(Node* node, size_t i, ArrayRef to) { } } +static void eraseListConstruct(Block* block, int opset_version); + +static void eraseListConstruct(Node* n, int opset_version) { + for (auto b : n->blocks()) { + eraseListConstruct(b, opset_version); + } + std::vector>> replacements; + + auto block = n->owningBlock(); + size_t i = 0; + for (auto* input : n->inputs()) { + if (input->node()->kind() == prim::ListConstruct) { + auto* lc_node = input->node(); + TypePtr elem = + lc_node->output()->type()->cast()->getElementType(); + if (elem->cast()) { + // ListConstruct Int[] output case, we need to transform to ONNX + // Concat to ensure the output is a single tensor(dynamic) type in + // order to be consumed as inputs + std::vector unsqueezed; + Graph* g = block->owningGraph(); + for (auto* input : lc_node->inputs()) { + Node* unsqueezed_node = + createONNXUnsqueeze(g, lc_node, input, 0, opset_version); + unsqueezed.emplace_back(unsqueezed_node->output()); + } + Node* concat_node = g->create(onnx::Concat, 1); + concat_node->i_(attr::axis, 0); + for (auto v : unsqueezed) { + concat_node->addInput(v); + } + concat_node->insertBefore(lc_node); + + // make concat node output as new input, then ListConstruct should + // become dead + replacements.emplace_back( + i, std::vector({concat_node->output()})); + + } else { + if (opset_version >= OPSET_VERSION_11) { + c10::Symbol seq_node_kind = lc_node->inputs().size() > 0 + ? onnx::SequenceConstruct + : onnx::SequenceEmpty; + Node* seq_node = block->owningGraph()->create( + seq_node_kind, {lc_node->inputs()}, 1); + seq_node->insertBefore(lc_node); + seq_node->output()->copyMetadata(lc_node->output()); + lc_node->replaceAllUsesWith(seq_node); + } + } + } + i++; + } + + for (auto ritr = replacements.rbegin(); ritr != replacements.rend(); ++ritr) { + replaceInputWithList(n, std::get<0>(*ritr), std::get<1>(*ritr)); + } +} + static void eraseListConstruct(Block* block, int opset_version) { // TODO: Fix this pass/maybe get rid of this part. // Tensor lists might be used for meshgrid and such ops as well. 
@@ -563,71 +618,9 @@ static void eraseListConstruct(Block* block, int opset_version) { Node* n = *it; ++it; - for (auto b : n->blocks()) { - eraseListConstruct(b, opset_version); - } - std::vector>> replacements; - - size_t i = 0; - for (auto* input : n->inputs()) { - if (input->node()->kind() == prim::ListConstruct) { - auto* lc_node = input->node(); - TypePtr elem = - lc_node->output()->type()->cast()->getElementType(); - if (elem->cast()) { - // ListConstruct Int[] output case, we need to transform to ONNX - // Concat to ensure the output is a single tensor(dynamic) type in - // order to be consumed as inputs - std::vector unsqueezed; - Graph* g = block->owningGraph(); - for (auto* input : lc_node->inputs()) { - Node* unsqueezed_node = g->create(onnx::Unsqueeze, 1); - unsqueezed_node->insertBefore(lc_node); - unsqueezed_node->addInput(input); - unsqueezed_node->is_(attr::axes, {0}); - unsqueezed.emplace_back(unsqueezed_node->output()); - } - Node* concat_node = g->create(onnx::Concat, 1); - concat_node->i_(attr::axis, 0); - for (auto v : unsqueezed) { - concat_node->addInput(v); - } - concat_node->insertBefore(lc_node); - - // make concat node output as new input, then ListConstruct should - // become dead - replacements.emplace_back( - i, std::vector({concat_node->output()})); - - } else { - if (opset_version < OPSET_VERSION_11) { - // Tensor lists are used mostly for inputs to cat/stack. They are - // already handled in those symbolics, and should become dead - // afterwards. - replacements.emplace_back( - i, - std::vector( - lc_node->inputs().begin(), lc_node->inputs().end())); - } else { - c10::Symbol seq_node_kind = lc_node->inputs().size() > 0 - ? onnx::SequenceConstruct - : onnx::SequenceEmpty; - Node* seq_node = block->owningGraph()->create( - seq_node_kind, {lc_node->inputs()}, 1); - seq_node->insertBefore(lc_node); - seq_node->output()->copyMetadata(lc_node->output()); - lc_node->replaceAllUsesWith(seq_node); - } - } - } - i++; - } - - for (auto ritr = replacements.rbegin(); ritr != replacements.rend(); - ++ritr) { - replaceInputWithList(n, std::get<0>(*ritr), std::get<1>(*ritr)); - } + eraseListConstruct(n, opset_version); } + eraseListConstruct(block->return_node(), opset_version); } // For ops such as meshgrid where output is a list of Tensors diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index 8d2ff1e63cfc..07d9340ade3a 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -201,7 +201,10 @@ bool IsSupportedNode(const Node* n) { return true; } -Value* CloneValueFromListConstruct(Value* v, std::shared_ptr n_graph) { +Value* CloneValueFromListConstruct( + Value* v, + std::shared_ptr n_graph, + int opset_version) { auto lc_node = v->node(); TORCH_INTERNAL_ASSERT(lc_node->kind() == ::c10::prim::ListConstruct); // In jit/passes/onnx/peephole.cpp::eraseListConstruct, @@ -221,12 +224,10 @@ Value* CloneValueFromListConstruct(Value* v, std::shared_ptr n_graph) { // order to be consumed as inputs std::vector unsqueezed; for (auto* input : lc_node->inputs()) { - Node* unsqueezed_node = - n_graph->insertNode(n_graph->create(::c10::onnx::Unsqueeze, 1)); auto new_input = n_graph->addInput(); new_input->copyMetadata(input); - unsqueezed_node->addInput(new_input); - unsqueezed_node->is_(attr::axes, {0}); + Node* unsqueezed_node = createONNXUnsqueeze( + n_graph.get(), n_graph->return_node(), new_input, 0, opset_version); 
unsqueezed.emplace_back(unsqueezed_node->output()); } Node* concat_node = @@ -261,11 +262,12 @@ Value* CloneValueFromListConstruct(Value* v, std::shared_ptr n_graph) { Node* CloneNodeToGraph( Node* n, std::shared_ptr n_graph, - const ParamMap& params_dict) { + const ParamMap& params_dict, + int opset_version) { auto vals_to_params_map = buildValueToParamsMap(n->owningGraph()->block(), params_dict); - auto clone_node = - n_graph->createClone(n, [&n_graph, &vals_to_params_map](Value* v) { + auto clone_node = n_graph->createClone( + n, [&n_graph, &vals_to_params_map, opset_version](Value* v) { auto v_n = v->node(); switch (v_n->kind()) { case ::c10::onnx::Constant: { @@ -275,7 +277,7 @@ Node* CloneNodeToGraph( return constant_n->output(); } case ::c10::prim::ListConstruct: { - return CloneValueFromListConstruct(v, n_graph); + return CloneValueFromListConstruct(v, n_graph, opset_version); } case ::c10::prim::PackPadded: { auto input = n_graph->addInput(); @@ -476,7 +478,7 @@ void ONNXShapeTypeInference( // Create a Graph containing only the single node n. // This graph is later converted to ONNX to run shape inference. auto n_graph = std::make_shared(); - auto clone_node = CloneNodeToGraph(n, n_graph, params_dict); + auto clone_node = CloneNodeToGraph(n, n_graph, params_dict, opset_version); n_graph->insertNode(clone_node); // Register all node outputs as graph outputs. @@ -507,12 +509,16 @@ void ONNXShapeTypeInference( } catch (std::runtime_error& ex) { // TODO: include this as warning once we have a more consolidated warning // system. + GRAPH_DEBUG( + "ONNX shape inference fails with: ", + ex.what(), + " on graph: ", + n_graph->toString()); const char shape_err[] = "ShapeInferenceError"; const char type_err[] = "TypeInferenceError"; if ((strstr(ex.what(), shape_err) == NULL) && (strstr(ex.what(), type_err) == NULL)) throw; - GRAPH_DEBUG("ONNX shape inference fails with: ", ex.what()); } GRAPH_DEBUG( "ONNX graph after shape inference: ", prettyPrint(*model_proto)); diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 187dcfcb87e2..8794c5f115c5 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -321,9 +321,19 @@ def _interpolate_warning(interpolate_mode): "to support Pytorch's behavior (like coordinate_transformation_mode and nearest_mode).\n" "We recommend using opset 11 and above for models using this operator. 
") -def _unsqueeze_helper(g, input, dim): - from torch.onnx.symbolic_opset9 import unsqueeze - return unsqueeze(g, input, dim) +def _unsqueeze_helper(g, input, axes_i): + if _export_onnx_opset_version >= 13: + axes = g.op("Constant", value_t=torch.tensor(axes_i, dtype=torch.long)) + return g.op("Unsqueeze", input, axes) + else: + return g.op("Unsqueeze", input, axes_i=axes_i) + +def _squeeze_helper(g, input, axes_i): + if _export_onnx_opset_version >= 13: + axes = g.op("Constant", value_t=torch.tensor(axes_i, dtype=torch.long)) + return g.op("Squeeze", input, axes) + else: + return g.op("Squeeze", input, axes_i=axes_i) def _interpolate_size_to_scales(g, input, output_size, dim): output_size = _maybe_get_const(output_size, 'is') @@ -371,7 +381,7 @@ def _interpolate_get_scales(g, scale_factor, dim): if isinstance(scale_factor.type(), torch._C.ListType) or (scale_factor_rank is not None and scale_factor_rank > 0): return g.op("Concat", offsets, scale_factor, axis_i=0) else: - scale_factor = _unsqueeze_helper(g, scale_factor, 0) + scale_factor = _unsqueeze_helper(g, scale_factor, [0]) scale_factor = g.op("Cast", scale_factor, to_i=cast_pytorch_to_onnx["Float"]) scales = [scale_factor for i in range(dim - 2)] scale_factor = g.op("Concat", offsets, *scales, axis_i=0) @@ -400,7 +410,7 @@ def _interpolate_get_scales_and_mode(g, input, size, scale_factor, mode , align_ if not _is_packed_list(size): is_scalar = ((_maybe_get_const(size, 't').dim() == 0)) if is_scalar: - size = _unsqueeze_helper(g, size, 0) + size = _unsqueeze_helper(g, size, [0]) size = [size for i in range(dim - 2)] size = g.op("Concat", *size, axis_i=0) scale_factor = _interpolate_size_to_scales(g, input, size, dim) @@ -477,9 +487,9 @@ def _index_fill_reshape_helper(g, self, dim, index): return _unimplemented("index_fill", "input rank not accesible") self_dim = self.type().dim() dim_value = _parse_arg(dim, 'i') - unsqueezed_index = g.op("Unsqueeze", index, axes_i=[i for i in range(self_dim) if i != dim_value]) + unsqueezed_index = _unsqueeze_helper(g, index, [i for i in range(self_dim) if i != dim_value]) expanded_index_shape = scatter(g, g.op("Shape", self), 0, - g.op("Unsqueeze", dim, axes_i=[0]), g.op("Shape", index)) + _unsqueeze_helper(g, dim, [0]), g.op("Shape", index)) expanded_index = expand(g, unsqueezed_index, expanded_index_shape, None) return expanded_index_shape, expanded_index diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py index 6558df6e3d4c..349eb8f35565 100644 --- a/torch/onnx/symbolic_opset10.py +++ b/torch/onnx/symbolic_opset10.py @@ -136,11 +136,11 @@ def __interpolate(g, input, size, scale_factor, mode , align_corners, recompute_ def _slice(g, input, axes, starts, ends, steps=None, dynamic_slice=False): if dynamic_slice: - starts = g.op("Unsqueeze", starts, axes_i=[0]) - ends = g.op("Unsqueeze", ends, axes_i=[0]) + starts = sym_help._unsqueeze_helper(g, starts, [0]) + ends = sym_help._unsqueeze_helper(g, ends, [0]) if isinstance(axes, int): axes = g.op("Constant", value_t=torch.tensor(axes)) - axes = g.op("Unsqueeze", axes, axes_i=[0]) + axes = sym_help._unsqueeze_helper(g, axes, [0]) else: assert len(starts) == len(ends) assert len(starts) == len(axes) @@ -220,15 +220,15 @@ def embedding_bag(g, offsets_extended = g.op("Concat", *offsets_extended, axis_i=0) list_ = [] for i in range(offset_len): - start_ = g.op("Unsqueeze", select(g, offsets_extended, torch.tensor(0), torch.tensor(i)), axes_i=[0]) - end_ = g.op("Unsqueeze", select(g, offsets_extended, torch.tensor(0), torch.tensor(i 
+ 1)), axes_i=[0]) + start_ = sym_help._unsqueeze_helper(g, select(g, offsets_extended, torch.tensor(0), torch.tensor(i)), [0]) + end_ = sym_help._unsqueeze_helper(g, select(g, offsets_extended, torch.tensor(0), torch.tensor(i + 1)), [0]) axes_ = g.op("Constant", value_t=torch.tensor([0])) indices_row = g.op("Slice", indices, start_, end_, axes_) embeddings = g.op("Gather", embedding_matrix, indices_row) if not sym_help._is_none(per_sample_weights): per_sample_weights_row = g.op("Slice", per_sample_weights, start_, end_, axes_) - per_sample_weights_row = g.op("Unsqueeze", per_sample_weights_row, axes_i=[1]) + per_sample_weights_row = sym_help._unsqueeze_helper(g, per_sample_weights_row, [1]) embeddings = g.op("Mul", embeddings, per_sample_weights_row) if mode == 0: embeddings = g.op("ReduceSum", embeddings, axes_i=[0], keepdims_i=0) @@ -237,7 +237,7 @@ def embedding_bag(g, else: embeddings = g.op("ReduceMax", embeddings, axes_i=[0], keepdims_i=0) - embeddings = g.op("Unsqueeze", embeddings, axes_i=[0]) + embeddings = sym_help._unsqueeze_helper(g, embeddings, [0]) list_.append(embeddings) output = g.op("Concat", *list_, axis_i=0) diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index 85c7bf97c883..82394bfebe2f 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -76,7 +76,7 @@ def index_put(g, self, indices_list_value, values, accumulate=False): index = add(g, index, ind) broadcast_index_shape = g.op("Shape", index) indices_list = [ - g.op("Unsqueeze", expand(g, ind, broadcast_index_shape, None), axes_i=[-1]) for ind in indices_list + sym_help._unsqueeze_helper(g, expand(g, ind, broadcast_index_shape, None), [-1]) for ind in indices_list ] index = g.op("Concat", *indices_list, axis_i=-1) else: @@ -180,7 +180,7 @@ def index_put(g, self, indices_list_value, values, accumulate=False): return masked_fill(g, self, bool_inp, values) return masked_scatter(g, self, bool_inp, values) broadcast_index_shape = g.op("Shape", index) - index = g.op("Unsqueeze", index, axes_i=[-1]) + index = sym_help._unsqueeze_helper(g, index, [-1]) sub_data_shape = sym_help._slice_helper( g, g.op("Shape", self), axes=[0], starts=[len(indices_list)], ends=[maxsize]) values_shape = g.op("Concat", broadcast_index_shape, sub_data_shape, axis_i=0) @@ -284,7 +284,7 @@ def __interpolate(g, input, size, scale_factor, mode, align_corners, recompute_s if rank is None: return sym_help._unimplemented("interpolate (with a scalar output_size)", "missing input shape (try giving an array of output_size values)") - size = unsqueeze(g, size, 0) + size = sym_help._unsqueeze_helper(g, size, [0]) size = [size for i in range(rank - 2)] size = g.op("Concat", *size, axis_i=0) size = g.op("Cast", size, to_i=sym_help.cast_pytorch_to_onnx['Long']) @@ -376,7 +376,7 @@ def _len(g, self): if _is_tensor_list(self) or self.node().kind() == "onnx::SplitToSequence": return g.op("SequenceLength", self) sz_0 = size(g, self, g.op("Constant", value_t=torch.LongTensor([0]))) - return g.op('Squeeze', sz_0, axes_i=[0]) + return sym_help._squeeze_helper(g, sz_0, [0]) def __getitem_(g, self, i): @@ -489,7 +489,7 @@ def split(g, self, split_size_or_sizes, dim, _outputs=None): return split_out # Convert to multiple slice nodes iff number of splits and number of outputs are statically known. 
if sym_help._is_packed_list(split_size_or_sizes) and len(sym_help._unpack_list(split_size_or_sizes)) == _outputs: - split_sizes = [g.op("Unsqueeze", v, axes_i=[0]) for v in sym_help._unpack_list(split_size_or_sizes)] + split_sizes = [sym_help._unsqueeze_helper(g, v, [0]) for v in sym_help._unpack_list(split_size_or_sizes)] start = g.op("Constant", value_t=torch.tensor([0], dtype=torch.long)) axis = g.op("Constant", value_t=torch.tensor([dim], dtype=torch.long)) res = [] @@ -658,7 +658,7 @@ def squeeze(g, self, dim=None): if_node_outputs = g.op("If", cond) if_node = if_node_outputs.node() if_block = torch.onnx.utils._add_block(if_node) - squeeze_ = if_block.op("Squeeze", self, axes_i=[dim]) + squeeze_ = sym_help._squeeze_helper(if_block, self, [dim]) torch.onnx.utils._add_output_to_block(if_block, squeeze_) else_block = torch.onnx.utils._add_block(if_node) identity_ = else_block.op("Identity", self) @@ -673,13 +673,12 @@ def squeeze(g, self, dim=None): "be exported without the squeeze node. If the model is intended to be used with dynamic " + "input shapes, please export with dynamic_axes argument.") return self - return g.op("Squeeze", self, axes_i=[dim]) + return sym_help._squeeze_helper(g, self, [dim]) @parse_args('v', 'i') def unsqueeze(g, self, dim): - return g.op("Unsqueeze", self, axes_i=[dim]) - + return sym_help._unsqueeze_helper(g, self, [dim]) def mm(g, self, other): return g.op("Gemm", self, other, beta_f=0.0, alpha_f=1.0) @@ -782,7 +781,7 @@ def _get_im2col_indices_along_dim(g, input_d, kernel_size_d, dilation_d, padding # Broadcast and add kernel staring positions (indices) with # kernel_grid along dim d, to get block indices along dim d - blocks_d_indices = g.op('Unsqueeze', blocks_d_indices, axes_i=[0]) # Reshape to [1, -1] + blocks_d_indices = sym_help._unsqueeze_helper(g, blocks_d_indices, [0]) # Reshape to [1, -1] kernel_mask = g.op('Reshape', kernel_grid, g.op('Constant', value_t=torch.tensor([-1, 1]))) block_mask = g.op("Add", blocks_d_indices, kernel_mask) @@ -804,8 +803,8 @@ def _get_im2col_output_shape(g, input, kernel_h, kernel_w): g.op("Constant", value_t=torch.tensor(kernel_h * kernel_w))) return g.op("Concat", - g.op("Unsqueeze", batch_dim, axes_i=[0]), - g.op("Unsqueeze", channel_unfolded, axes_i=[0]), + sym_help._unsqueeze_helper(g, batch_dim, [0]), + sym_help._unsqueeze_helper(g, channel_unfolded, [0]), g.op("Constant", value_t=torch.tensor([-1])), axis_i=0) @@ -901,9 +900,9 @@ def embedding_bag(g, loop_condition = g.op("Cast", loop_condition, to_i=9) zero = g.op("Constant", value_t=torch.tensor([0])) - indices_len = g.op("Unsqueeze", - sym_help._size_helper(g, indices, g.op("Constant", value_t=torch.tensor(0))), - axes_i=[0]) + indices_len = sym_help._unsqueeze_helper(g, + sym_help._size_helper(g, indices, g.op("Constant", value_t=torch.tensor(0))), + [0]) if not include_last_offset: offsets = [offsets, indices_len] offsets = g.op("Concat", *offsets, axis_i=0) @@ -923,8 +922,8 @@ def embedding_bag(g, indices_start = loop_block.op("Gather", offsets_starts, block_input_iter, axis_i=0) indices_end = loop_block.op("Gather", offsets_ends, block_input_iter, axis_i=0) - indices_start = loop_block.op("Unsqueeze", indices_start, axes_i=[0]) - indices_end = loop_block.op("Unsqueeze", indices_end, axes_i=[0]) + indices_start = sym_help._unsqueeze_helper(loop_block, indices_start, [0]) + indices_end = sym_help._unsqueeze_helper(loop_block, indices_end, [0]) indices_row = loop_block.op("Slice", indices, indices_start, indices_end, zero) embeddings = 
loop_block.op("Gather", embedding_matrix, indices_row, axis_i=0) @@ -933,7 +932,7 @@ def embedding_bag(g, indices_start, indices_end, zero) - per_sample_weights_row = loop_block.op("Unsqueeze", per_sample_weights_row, axes_i=[1]) + per_sample_weights_row = sym_help._unsqueeze_helper(loop_block, per_sample_weights_row, [1]) embeddings = loop_block.op("Mul", embeddings, per_sample_weights_row) if mode == 0: embeddings = loop_block.op("ReduceSum", embeddings, axes_i=[0], keepdims_i=0) diff --git a/torch/onnx/symbolic_opset12.py b/torch/onnx/symbolic_opset12.py index cd67fd508fa2..63a40b555c8e 100644 --- a/torch/onnx/symbolic_opset12.py +++ b/torch/onnx/symbolic_opset12.py @@ -132,11 +132,11 @@ def unfold(g, input, dimension, size, step): starts = loop_block.op("Gather", low_indices, block_input_iter) ends = loop_block.op("Gather", hi_indices, block_input_iter) axes = loop_block.op("Constant", value_t=torch.tensor([2])) - starts = loop_block.op("Unsqueeze", starts, axes_i=[0]) - ends = loop_block.op("Unsqueeze", ends, axes_i=[0]) + starts = sym_help._unsqueeze_helper(loop_block, starts, [0]) + ends = sym_help._unsqueeze_helper(loop_block, ends, [0]) stack = loop_block.op("Slice", input, starts, ends, axes) - unsqueeze = loop_block.op("Unsqueeze", loop_block.op("Transpose", stack, perm_i=perm), axes_i=[dimension]) + unsqueeze = sym_help._unsqueeze_helper(loop_block, loop_block.op("Transpose", stack, perm_i=perm), [dimension]) unsqueeze_list.append(unsqueeze) concat = loop_block.op("Concat", *unsqueeze_list, axis_i=0) @@ -148,7 +148,7 @@ def unfold(g, input, dimension, size, step): perm = [0, 1, 2, 3, 4] perm[0], perm[dimension + 1] = perm[dimension + 1], perm[0] transpose = g.op("Transpose", loop_output, perm_i=perm) - squeeze = g.op("Squeeze", transpose, axes_i=[0]) + squeeze = sym_help._squeeze_helper(g, transpose, [0]) return squeeze else: diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index ada731884f76..a69a7be56850 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -186,7 +186,7 @@ def cat(g, tensor_list, dim): @parse_args('v', 'i') def stack(g, tensor_list, dim): - unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in sym_help._unpack_list(tensor_list)] + unsqueezed = [sym_help._unsqueeze_helper(g, t, [dim]) for t in sym_help._unpack_list(tensor_list)] return g.op("Concat", *unsqueezed, axis_i=dim) @@ -592,7 +592,7 @@ def unbind(g, self, dim=0, _outputs=None): outputs = g.op("Split", self, split_i=[1] * _outputs, axis_i=dim, outputs=_outputs) outputs = [outputs] if _outputs == 1 else outputs - squeezed_outputs = [g.op("Squeeze", out, axes_i=[dim]) for out in outputs] + squeezed_outputs = [sym_help._squeeze_helper(g, out, [dim]) for out in outputs] return squeezed_outputs @@ -605,7 +605,7 @@ def select(g, self, dim, index): else: end_index = index + 1 slice_node = sym_help._slice_helper(g, self, axes=[dim], starts=[index], ends=[end_index]) - return g.op("Squeeze", slice_node, axes_i=[dim]) + return sym_help._squeeze_helper(g, slice_node, [dim]) else: return g.op("Gather", self, index, axis_i=dim) @@ -640,7 +640,7 @@ def squeeze(g, self, dim=None): "is not 1, the ONNX model will return an error. 
Opset version 11 supports squeezing on " + "non-singleton dimensions, it is recommended to export this model using opset " + "version 11 or higher.") - return g.op("Squeeze", self, axes_i=[squeeze_dim]) + return sym_help._squeeze_helper(g, self, axes_i=[squeeze_dim]) if dim_size > 1: warnings.warn("This model contains a squeeze operation on dimension " + str(squeeze_dim) + ". The size of " + "this dimension in the given input is " + str(dim_size) + ". The model will " + @@ -651,12 +651,12 @@ def squeeze(g, self, dim=None): warnings.warn("This model contains a squeeze operation on dimension " + str(squeeze_dim) + ". If the model is " + "intended to be used with dynamic input shapes, please use opset version 11 to export the model.") - return g.op("Squeeze", self, axes_i=[squeeze_dim]) + return sym_help._squeeze_helper(g, self, axes_i=[squeeze_dim]) def prelu(g, self, weight): self_rank = sym_help._get_tensor_rank(self) if self_rank is not None and self_rank > 2: - weight = g.op("Unsqueeze", weight, axes_i=list(range(1, self_rank - 1))) + weight = sym_help._unsqueeze_helper(g, weight, list(range(1, self_rank - 1))) return g.op("PRelu", self, weight) @@ -674,7 +674,7 @@ def floor(g, input): def _len(g, self): sz_0 = size(g, self, g.op("Constant", value_t=torch.LongTensor([0]))) - return g.op('Squeeze', sz_0, axes_i=[0]) + return sym_help._squeeze_helper(g, sz_0, [0]) @parse_args('v', 't', 't') @@ -1356,7 +1356,7 @@ def unfold(g, input, dimension, size, step): ndim = len(sizes) perm = list(range(0, ndim)) perm.append(perm.pop(dimension)) - unsqueeze = [g.op("Unsqueeze", g.op("Transpose", t, perm_i=perm), axes_i=[dimension]) for t in stack] + unsqueeze = [sym_help._unsqueeze_helper(g, g.op("Transpose", t, perm_i=perm), [dimension]) for t in stack] return g.op("Concat", *unsqueeze, axis_i=dimension) else: return _unimplemented("Unfold", "input size not accessible") @@ -1732,14 +1732,14 @@ def eye(g, *args): if len(args) == 5: # aten::eye(n, dtype, layout, device, pin_memory) n, dtype, layout, device, pin_memory = args - dim_size = g.op("Unsqueeze", n, axes_i=[0]) + dim_size = sym_help._unsqueeze_helper(g, n, [0]) shape = g.op("Concat", dim_size, dim_size, axis_i=0) tensor = zeros(g, shape, dtype, layout, device) return g.op("EyeLike", tensor) elif len(args) == 6: # aten::eye(n, m, dtype, layout, device, pin_memory) n, m, dtype, layout, device, pin_memory = args - shape = g.op("Concat", g.op("Unsqueeze", n, axes_i=[0]), g.op("Unsqueeze", m, axes_i=[0]), axis_i=0) + shape = g.op("Concat", sym_help._unsqueeze_helper(g, n, [0]), sym_help._unsqueeze_helper(g, m, [0]), axis_i=0) tensor = zeros(g, shape, dtype, layout, device) return g.op("EyeLike", tensor) else: @@ -1760,9 +1760,9 @@ def slice(g, self, *args): 'is a deprecated experimental op. 
Please use statically allocated ' 'variables or export to a higher opset version.') else: - start_unsqueezed = g.op("Unsqueeze", start, axes_i=[0]) - end_unsqueezed = g.op("Unsqueeze", end, axes_i=[0]) - dim_unsqueezed = g.op("Unsqueeze", dim, axes_i=[0]) + start_unsqueezed = sym_help._unsqueeze_helper(g, start, [0]) + end_unsqueezed = sym_help._unsqueeze_helper(g, end, [0]) + dim_unsqueezed = sym_help._unsqueeze_helper(g, dim, [0]) return g.op("DynamicSlice", self, start_unsqueezed, end_unsqueezed, dim_unsqueezed) else: start = _parse_arg(start, 'i') @@ -1814,7 +1814,7 @@ def unsqueeze(g, self, dim): else: return _unimplemented('unsqueeze', 'negative axis with unknown input rank') - return g.op("Unsqueeze", self, axes_i=[dim]) + return sym_help._unsqueeze_helper(g, self, axes_i=[dim]) @parse_args('v', 'i', 'i', 'none') @@ -1973,7 +1973,7 @@ def transform_weights_no_bias(layer_index): elif variant == 'GRU' or variant == 'LSTM': weight_ih, weight_hh = \ [reform_weights(g, w, hidden_size, reform_permutation) for w in weights] - return tuple(g.op('Unsqueeze', x, axes_i=[0]) for x in (weight_ih, weight_hh)) + return tuple(sym_help._unsqueeze_helper(g, x, [0]) for x in (weight_ih, weight_hh)) def transform_weights(layer_index): weights = layer_weights[layer_index] @@ -1983,7 +1983,7 @@ def transform_weights(layer_index): weight_ih, weight_hh, bias_ih, bias_hh = \ [reform_weights(g, w, hidden_size, reform_permutation) for w in weights] bias_concat = g.op('Concat', bias_ih, bias_hh, axis_i=0) - return tuple(g.op('Unsqueeze', x, axes_i=[0]) for x in (weight_ih, weight_hh, bias_concat)) + return tuple(sym_help._unsqueeze_helper(g, x, [0]) for x in (weight_ih, weight_hh, bias_concat)) def retrieve_state(x, start, end): return x if num_layers == 1 else sym_help._slice_helper(g, x, axes=[0], starts=[start], ends=[end]) @@ -2050,7 +2050,7 @@ def retrieve_state(x, start, end): prev_output = g.op('Transpose', prev_output, perm_i=[0, 2, 1, 3]) prev_output = g.op('Reshape', prev_output, g.op('Constant', value_t=torch.LongTensor([0, 0, -1]))) else: - prev_output = g.op('Squeeze', prev_output, axes_i=[1]) + prev_output = sym_help._squeeze_helper(g, prev_output, [1]) h_outs.append(h_out) if variant == 'LSTM': @@ -2382,7 +2382,7 @@ def gather(g, self, dim, index, sparse_grad=False): values = g.op("Constant", value_t=torch.LongTensor([0, 1])) depth = size(g, self, g.op("Constant", value_t=torch.LongTensor([dim]))) index = g.op("Cast", g.op("OneHot", index, depth, values, axis_i=dim), to_i=sym_help.cast_pytorch_to_onnx[dtype]) - mul = g.op("Mul", g.op("Unsqueeze", self, axes_i=[dim + 1]), index) + mul = g.op("Mul", sym_help._unsqueeze_helper(g, self, [dim + 1]), index) return g.op("ReduceSum", mul, axes_i=[dim], keepdims_i=0) @@ -2477,42 +2477,42 @@ def _get_arange_dtype(dtype): if len(args) == 2: # aten::arange(Scalar end, Tensor out) - end = g.op("Unsqueeze", args[0], axes_i=[0]) + end = sym_help._unsqueeze_helper(g, args[0], [0]) dtype = 4 # default to int64 - arange_tensor = g.op("Squeeze", nonzero(g, ones(g, end, dtype, None, None)), axes_i=[1]) + arange_tensor = sym_help._squeeze_helper(g, nonzero(g, ones(g, end, dtype, None, None)), [1]) return g.op("Cast", arange_tensor, to_i=sym_help.scalar_type_to_onnx[dtype]) elif len(args) == 4: # aten::arange(Scalar start, Scalar end, Scalar step, Tensor out) dtype = 4 # default to int64 - step = g.op("Unsqueeze", args[2], axes_i=[0]) - end = g.op("Unsqueeze", args[1], axes_i=[0]) - start = g.op("Unsqueeze", args[0], axes_i=[0]) + step = 
sym_help._unsqueeze_helper(g, args[2], [0]) + end = sym_help._unsqueeze_helper(g, args[1], [0]) + start = sym_help._unsqueeze_helper(g, args[0], [0]) range_tensor = g.op("Div", g.op("Sub", end, start), step) - arange_tensor = g.op("Squeeze", nonzero(g, ones(g, range_tensor, None, None, None)), axes_i=[1]) + arange_tensor = sym_help._squeeze_helper(g, nonzero(g, ones(g, range_tensor, None, None, None)), [1]) arange_tensor = g.op("Add", g.op("Mul", arange_tensor, step), start) return g.op("Cast", arange_tensor, to_i=sym_help.scalar_type_to_onnx[dtype]) elif len(args) == 5: # aten::arange(Scalar end, ScalarType dtype, Layout, Device, bool pin_memory) dtype = _get_arange_dtype(args[1]) - end = g.op("Unsqueeze", args[0], axes_i=[0]) - arange_tensor = g.op("Squeeze", nonzero(g, ones(g, end, dtype, *(args[2:]))), axes_i=[1]) + end = sym_help._unsqueeze_helper(g, args[0], [0]) + arange_tensor = sym_help._squeeze_helper(g, nonzero(g, ones(g, end, dtype, *(args[2:]))), [1]) return g.op("Cast", arange_tensor, to_i=sym_help.scalar_type_to_onnx[dtype]) elif len(args) == 6: # aten::arange(Scalar start, Scalar end, ScalarType dtype, Layout, Device, bool pin_memory) dtype = _get_arange_dtype(args[2]) - end = g.op("Unsqueeze", args[1], axes_i=[0]) - start = g.op("Unsqueeze", args[0], axes_i=[0]) + end = sym_help._unsqueeze_helper(g, args[1], [0]) + start = sym_help._unsqueeze_helper(g, args[0], [0]) range_tensor = g.op("Sub", end, start) - arange_tensor = g.op("Add", g.op("Squeeze", nonzero(g, ones(g, range_tensor, dtype, *(args[3:]))), axes_i=[1]), start) + arange_tensor = g.op("Add", sym_help._squeeze_helper(g, nonzero(g, ones(g, range_tensor, dtype, *(args[3:]))), [1]), start) return g.op("Cast", arange_tensor, to_i=sym_help.scalar_type_to_onnx[dtype]) elif len(args) == 7: # aten::arange(Scalar start, Scalar end, Scalar step, ScalarType dtype, Layout, Device, bool pin_memory) dtype = _get_arange_dtype(args[3]) - step = g.op("Unsqueeze", args[2], axes_i=[0]) - end = g.op("Unsqueeze", args[1], axes_i=[0]) - start = g.op("Unsqueeze", args[0], axes_i=[0]) + step = sym_help._unsqueeze_helper(g, args[2], [0]) + end = sym_help._unsqueeze_helper(g, args[1], [0]) + start = sym_help._unsqueeze_helper(g, args[0], [0]) range_tensor = g.op("Div", g.op("Sub", end, start), step) - arange_tensor = g.op("Squeeze", nonzero(g, ones(g, range_tensor, dtype, *(args[4:]))), axes_i=[1]) + arange_tensor = sym_help._squeeze_helper(g, nonzero(g, ones(g, range_tensor, dtype, *(args[4:]))), [1]) arange_tensor = g.op("Add", g.op("Mul", arange_tensor, step), start) return g.op("Cast", arange_tensor, to_i=sym_help.scalar_type_to_onnx[dtype]) else: @@ -2541,7 +2541,7 @@ def try_mask_to_index(index): warnings.warn("Exporting aten::index operator with indices of type Byte. " "Only 1-D indices are supported. 
In any other case, " "this will produce an incorrect ONNX graph.") - index = squeeze(g, nonzero(g, index), dim=1) + index = sym_help._squeeze_helper(g, nonzero(g, index), [1]) return index indices = [try_mask_to_index(idx) for idx in indices] @@ -2730,7 +2730,7 @@ def group_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled): # Norm has shape [N, C, *] so we reshape weight and bias to [C, *] axes = list(range(1, input_rank - 1)) - return add(g, mul(g, norm, g.op("Unsqueeze", weight, axes_i=axes)), g.op("Unsqueeze", bias, axes_i=axes)) + return add(g, mul(g, norm, sym_help._unsqueeze_helper(g, weight, axes)), sym_help._unsqueeze_helper(g, bias, axes)) @parse_args('v', 'v', 'i') From 1723ab53c493cf0fb1a3510c7a19fb81f20bc972 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 28/41] [ONNX] Update Reducesum operator for opset 13 (#50532) (#50907) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50907 * udpate symbolic for squeeze/unsqueeze * update c++ unsqueeze/squeeze creation * clang format * enable tests * clang format * remove prints * remove magic number * add helper function * fix build issue * update opset9 symbolic with helper function * fix utility test * fix prim_fallthrough opset skip * enable reducesum opset 13 * enable embedding_bag which contain reducesum op * add ReduceSum helper * remove block_listed_operators * remove local test code * remove embedding_bag() in opset13 file * remove unuse import Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050888 Pulled By: SplitInfinity fbshipit-source-id: 88307af6a7880abf94eac126ec1638e962de8c1f Co-authored-by: BowenBao Co-authored-by: hwangdeyu --- test/onnx/test_pytorch_onnx_onnxruntime.py | 4 -- torch/onnx/symbolic_helper.py | 11 ++++++ torch/onnx/symbolic_opset10.py | 2 +- torch/onnx/symbolic_opset11.py | 2 +- torch/onnx/symbolic_opset13.py | 46 +++++++++++++++++++--- torch/onnx/symbolic_opset9.py | 8 ++-- 6 files changed, 57 insertions(+), 16 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 5b7091b6612f..1c9f97488f27 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -2762,7 +2762,6 @@ def forward(self, input): x = torch.randn(4, 5, dtype=torch.float) self.run_test(ReducedOpModule(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_reduced_sum(self): return self._test_reduced_ops(op=torch.sum) @@ -4319,7 +4318,6 @@ def forward(self, input): @disableScriptTest() # error in propagate as assign input shape @skipIfUnsupportedMinOpsetVersion(10) - @skipIfUnsupportedOpsetVersion([12, 13]) # Due to ONNX Loop shape inference issue def test_embedding_bag(self): model = torch.nn.EmbeddingBag(10, 5, mode='sum', scale_grad_by_freq=True) input = torch.randint(10, (7,)) @@ -4336,7 +4334,6 @@ def test_embedding_bag(self): self.run_test(model, (input)) @skipIfUnsupportedMinOpsetVersion(11) - @skipIfUnsupportedOpsetVersion([12, 13]) # Due to ONNX Loop shape inference issue def test_embedding_bag_1d_per_sample_weights(self): class EmbeddingModel(torch.nn.Module): def forward(self, embedding_matrix, input, offset, weights): @@ -4351,7 +4348,6 @@ def forward(self, embedding_matrix, input, offset, weights): self.run_test(model, (embedding_matrix, x, offset, w)) @skipIfUnsupportedMinOpsetVersion(11) - @skipIfUnsupportedOpsetVersion([12, 13]) # Due to ONNX Loop shape inference issue def 
test_embedding_bag_2d_per_sample_weights(self): class EmbeddingModel(torch.nn.Module): def forward(self, embedding_matrix, input, weights): diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 8794c5f115c5..5d11bce82135 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -335,6 +335,17 @@ def _squeeze_helper(g, input, axes_i): else: return g.op("Squeeze", input, axes_i=axes_i) +def _reducesum_helper(g, input, axes_i=None, keepdims_i=1, noop_with_empty_axes_i=0): + keepdims_i = _maybe_get_const(keepdims_i, 'i') + if _export_onnx_opset_version >= 13: + if axes_i: + if not _is_value(axes_i): + axes_i = g.op("Constant", value_t=torch.tensor(axes_i, dtype=torch.long)) + return g.op("ReduceSum", input, axes_i, keepdims_i=keepdims_i, noop_with_empty_axes_i=noop_with_empty_axes_i) + return g.op("ReduceSum", input, keepdims_i=keepdims_i, noop_with_empty_axes_i=noop_with_empty_axes_i) + else: + return g.op("ReduceSum", input, axes_i=axes_i, keepdims_i=keepdims_i) + def _interpolate_size_to_scales(g, input, output_size, dim): output_size = _maybe_get_const(output_size, 'is') if _is_value(output_size): diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py index 349eb8f35565..b7f0bb6167b2 100644 --- a/torch/onnx/symbolic_opset10.py +++ b/torch/onnx/symbolic_opset10.py @@ -231,7 +231,7 @@ def embedding_bag(g, per_sample_weights_row = sym_help._unsqueeze_helper(g, per_sample_weights_row, [1]) embeddings = g.op("Mul", embeddings, per_sample_weights_row) if mode == 0: - embeddings = g.op("ReduceSum", embeddings, axes_i=[0], keepdims_i=0) + embeddings = sym_help._reducesum_helper(g, embeddings, axes_i=[0], keepdims_i=0) elif mode == 1: embeddings = g.op("ReduceMean", embeddings, axes_i=[0], keepdims_i=0) else: diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index 82394bfebe2f..86c1bd15b656 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -935,7 +935,7 @@ def embedding_bag(g, per_sample_weights_row = sym_help._unsqueeze_helper(loop_block, per_sample_weights_row, [1]) embeddings = loop_block.op("Mul", embeddings, per_sample_weights_row) if mode == 0: - embeddings = loop_block.op("ReduceSum", embeddings, axes_i=[0], keepdims_i=0) + embeddings = sym_help._reducesum_helper(loop_block, embeddings, axes_i=[0], keepdims_i=0) elif mode == 1: embeddings = loop_block.op("ReduceMean", embeddings, axes_i=[0], keepdims_i=0) else: diff --git a/torch/onnx/symbolic_opset13.py b/torch/onnx/symbolic_opset13.py index 001a20147c4f..9fffa23a1131 100644 --- a/torch/onnx/symbolic_opset13.py +++ b/torch/onnx/symbolic_opset13.py @@ -2,15 +2,16 @@ # see Note [Edit Symbolic Files] in symbolic_helper.py # This file exports ONNX ops for opset 13 -from torch.onnx.symbolic_helper import _block_list_in_opset import torch import torch.onnx.symbolic_helper as sym_help -from torch.onnx.symbolic_helper import parse_args +from torch.onnx.symbolic_helper import parse_args, _unimplemented +from torch.onnx.symbolic_opset9 import overload_by_arg_count, _maybe_cast_reduce_op_input -block_listed_operators = ['embedding_bag'] -for block_listed_op in block_listed_operators: - vars()[block_listed_op] = _block_list_in_opset(block_listed_op) +# EDITING THIS FILE? READ THIS FIRST! 
+# see Note [Edit Symbolic Files] in symbolic_helper.py + +# This file exports ONNX ops for opset 13 @parse_args('v', 'i', 'none') @@ -38,7 +39,7 @@ def frobenius_norm(g, self, dim=None, keepdim=False): if not sym_help._is_value(dim_val) and len(dim_val) == 0: return g.op("ReduceL2", self, keepdims_i=0) sqr = g.op('Mul', self, self) - sumsqr = g.op('ReduceSum', sqr, dim, keepdims_i=keepdim) + sumsqr = sym_help._reducesum_helper(g, sqr, dim, keepdims_i=keepdim) return g.op('Sqrt', sumsqr) @@ -108,3 +109,36 @@ def unbind(g, self, dim=0, _outputs=None): def glu(g, input, dim): first, second = g.op('Split', input, dim, outputs=2) return g.op('Mul', first, g.op('Sigmoid', second)) + + +def _reduce_op_symbolic(onnx_op_name): + def symbolic(g, self, dim=None, keepdim=None): + self = _maybe_cast_reduce_op_input(g, self) + if dim is None: + # all-reduce path + return g.op(onnx_op_name, self, keepdims_i=0) + else: + keepdim = sym_help._get_const(keepdim, 'i', 'keepdim') + return g.op(onnx_op_name, self, dim, keepdims_i=keepdim) + return symbolic + +def _reduce_with_dtype(onnx_op, name): + symbolic = _reduce_op_symbolic(onnx_op) + + @overload_by_arg_count + def reduce(g, *args, **kwargs): + @parse_args('v', 'none') + def reduce_nodim(g, self, dtype): + if dtype.node().kind() != 'prim::Constant': + return _unimplemented(name, "dtype") + return symbolic(g, self) + + @parse_args('v', 'v', 'i', 'none') + def reduce_dim(g, self, dim, keepdim, dtype): + if dtype.node().kind() != 'prim::Constant': + return _unimplemented(name, "dtype") + return symbolic(g, self, dim, keepdim) + return reduce_nodim, reduce_dim + return reduce + +sum = _reduce_with_dtype('ReduceSum', 'sum') diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index a69a7be56850..031f4505e655 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -753,7 +753,7 @@ def softmax(g, input, dim, dtype=None): input = g.op('Sub', input, g.op('ReduceMax', input, axes_i=[dim], keepdims_i=1)) exp = g.op('Exp', input) - sum = g.op('ReduceSum', exp, axes_i=[dim]) + sum = sym_help._reducesum_helper(g, exp, axes_i=[dim]) softmax = g.op('Div', exp, sum) if dtype and dtype.node().kind() != 'prim::Constant': parsed_dtype = sym_help._get_const(dtype, 'i', 'dtype') @@ -2383,7 +2383,7 @@ def gather(g, self, dim, index, sparse_grad=False): depth = size(g, self, g.op("Constant", value_t=torch.LongTensor([dim]))) index = g.op("Cast", g.op("OneHot", index, depth, values, axis_i=dim), to_i=sym_help.cast_pytorch_to_onnx[dtype]) mul = g.op("Mul", sym_help._unsqueeze_helper(g, self, [dim + 1]), index) - return g.op("ReduceSum", mul, axes_i=[dim], keepdims_i=0) + return sym_help._reducesum_helper(g, mul, axes_i=[dim], keepdims_i=0) @parse_args('v', 'is', 'b', 'i') @@ -2639,7 +2639,7 @@ def try_mask_to_index(index): @parse_args('v', 'is', 'i') def frobenius_norm(g, self, dim=None, keepdim=False): sqr = g.op('Mul', self, self) - sumsqr = g.op('ReduceSum', sqr, axes_i=dim, keepdims_i=keepdim) + sumsqr = sym_help._reducesum_helper(g, sqr, axes_i=dim, keepdims_i=keepdim) return g.op('Sqrt', sumsqr) @@ -2805,7 +2805,7 @@ def kl_div(g, input, target, reduction, log_target): elif reduction == 1: return g.op("ReduceMean", output, keepdims_i=0) elif reduction == 2: - return g.op("ReduceSum", output, keepdims_i=0) + return sym_help._reducesum_helper(g, output, keepdims_i=0) else: return sym_help._onnx_unsupported("kl_div with reduction other than none, mean, or sum. 
Please open a bug to " "request ONNX export support for the missing reduction type.") From b308fb78d1be446bb20bf9b73ee3f6d21f080c31 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 29/41] [ONNX] Add binary_cross_entropy_with_logits op to ONNX opset version 12 (#49675) (#50908) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50908 Fixes #{#47997} Exporting the operator binary_cross_entropy_with_logits to ONNX opset version 12. Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050885 Pulled By: SplitInfinity fbshipit-source-id: e4167895eed804739aa50481679500a4d564b360 --- test/onnx/test_pytorch_onnx_onnxruntime.py | 46 ++++++++++++++++++++++ torch/onnx/symbolic_opset12.py | 28 +++++++++++++ 2 files changed, 74 insertions(+) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 1c9f97488f27..42e3e91b5ce4 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -5159,6 +5159,52 @@ def forward(self, input, target): target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) self.run_test(NLLModel(), (input, target)) + + @skipIfUnsupportedMinOpsetVersion(12) + def test_binary_cross_entropy_with_logits(self): + x = torch.randn(5) + y = torch.empty(5).random_(2) + self._bce_logits_loss(x, y) + + x = torch.randn(2, 3, 5, 7) + y = torch.empty(2, 3, 5, 7).random_(2) + weight = torch.tensor([2]) + self._bce_logits_loss(x, y, weight) + + x = torch.FloatTensor([[-0.4089, -1.2471, 0.5907], [-0.4897, -0.8267, -0.7349], [0.5241, -0.1246, -0.4751]]) + y = torch.FloatTensor([[0, 1, 1], [0, 0, 1], [1, 0, 1]]) + pos_weight = torch.empty([3]).random_(2) + self._bce_logits_loss(x, y, pos_weight) + + x = torch.randn(3, 3, 4) + y = torch.empty(3, 3, 4).random_(2) + weight = torch.tensor([3]) + pos_weight = torch.empty([3, 4]).random_(2) + self._bce_logits_loss(x, y, weight, pos_weight) + + def _bce_logits_loss(self, x, y, weight=None, pos_weight=None): + class BCEWithLogitsLossNoneWeights(torch.nn.Module): + def forward(self, input, target, weight, pos_weight): + return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, + pos_weight=pos_weight, reduction='none') + + self.run_test(BCEWithLogitsLossNoneWeights(), input=(x, y, weight, pos_weight)) + + class BCEWithLogitsLossMeanWeights(torch.nn.Module): + def forward(self, input, target, weight, pos_weight): + return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, + pos_weight=pos_weight, reduction='mean') + + self.run_test(BCEWithLogitsLossMeanWeights(), input=(x, y, weight, pos_weight)) + + class BCEWithLogitsLossSumWeights(torch.nn.Module): + def forward(self, input, target, weight, pos_weight): + return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, + pos_weight=pos_weight, reduction='sum') + + self.run_test(BCEWithLogitsLossSumWeights(), input=(x, y, weight, pos_weight)) + + def test_torch_mm(self): class M(torch.nn.Module): def forward(self, mat1, mat2): diff --git a/torch/onnx/symbolic_opset12.py b/torch/onnx/symbolic_opset12.py index 63a40b555c8e..5a926eef5e1d 100644 --- a/torch/onnx/symbolic_opset12.py +++ b/torch/onnx/symbolic_opset12.py @@ -52,6 +52,34 @@ def nll_loss2d(g, self, target, weight, reduction, ignore_index): return nll_loss(g, self, target, weight, reduction, ignore_index) +@parse_args('v', 'v', 'v', 'v', 'i') +def 
binary_cross_entropy_with_logits(g, input, target, weight, pos_weight, reduction): + from torch.onnx.symbolic_opset9 import sigmoid, log, sub, neg, mul, add + p = g.op("Constant", value_t=torch.tensor([1])) + sig_x = sigmoid(g, input) + log_sig_x = log(g, sig_x) + sub_1_x = sub(g, p, sig_x) + sub_1_y = sub(g, p, target) + log_1_x = log(g, sub_1_x) + if pos_weight is None or sym_help._is_none(pos_weight): + output = neg(g, add(g, mul(g, target, log_sig_x), mul(g, sub_1_y, log_1_x))) + else: + output = neg(g, add(g, mul(g, mul(g, target, log_sig_x), pos_weight), mul(g, sub_1_y, log_1_x))) + + if weight is not None and not sym_help._is_none(weight): + output = mul(g, weight, output) + + reduction = sym_help._maybe_get_const(reduction, 'i') + if reduction == 0: + return output + elif reduction == 1: + return g.op("ReduceMean", output) + elif reduction == 2: + return g.op("ReduceSum", output) + else: + return sym_help._onnx_unsupported("binary_cross_entropy_with_logits with reduction other than none, mean, or sum") + + def celu(g, self, alpha): alpha = sym_help._maybe_get_const(alpha, 'f') # if the input is of type double cast it to float From e90a480d408336258bb7728d023825599587eb5c Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 30/41] [ONNX] Add logical_and, logical_or, logical_xor torch op support in pytorch exporter (#50570) (#50909) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50909 Fixes #{} Add logical_and, logical_or, logical_xor torch op support in pytorch exporter. Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050884 Pulled By: SplitInfinity fbshipit-source-id: 2db564e9726c18a3477f9268a0ff862cd2c40e4d --- test/onnx/test_pytorch_onnx_onnxruntime.py | 66 ++++++++++++++++++++++ torch/onnx/symbolic_opset9.py | 15 +++++ 2 files changed, 81 insertions(+) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 42e3e91b5ce4..d4b6a9bf9a2b 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -3217,6 +3217,72 @@ def _test_compare_ops(self, model, num_inputs): self.run_test(model, x_float) self.run_test(model, x_int) + @skipIfUnsupportedMinOpsetVersion(9) + def test_logical_and(self): + class AndModel(torch.nn.Module): + def forward(self, x, y): + return torch.logical_and(x, y) + + x = torch.randint(0, 2, (5, 5), dtype=torch.bool) + y = torch.randint(0, 2, (5, 5), dtype=torch.bool) + self.run_test(AndModel(), input=(x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.int32) + y = torch.randint(10, (5, 5), dtype=torch.int32) + self.run_test(AndModel(), input=(x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.double) + y = torch.randint(10, (5, 5), dtype=torch.double) + self.run_test(AndModel(), input=(x, y)) + + x = torch.randint(10, (2, 3, 5), dtype=torch.float32) + y = torch.randint(10, (2, 3, 5), dtype=torch.long) + self.run_test(AndModel(), input=(x, y)) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_logical_or(self): + class OrModel(torch.nn.Module): + def forward(self, x, y): + return torch.logical_or(x, y) + + x = torch.randint(0, 2, (5, 5), dtype=torch.bool) + y = torch.randint(0, 2, (5, 5), dtype=torch.bool) + self.run_test(OrModel(), input=(x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.int32) + y = torch.randint(10, (5, 5), dtype=torch.int32) + self.run_test(OrModel(), input=(x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.double) + y = torch.randint(10, (5, 
5), dtype=torch.double) + self.run_test(OrModel(), input=(x, y)) + + x = torch.randint(10, (2, 3, 5), dtype=torch.float32) + y = torch.randint(10, (2, 3, 5), dtype=torch.long) + self.run_test(OrModel(), input=(x, y)) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_logical_xor(self): + class XorModel(torch.nn.Module): + def forward(self, x, y): + return torch.logical_xor(x, y) + + x = torch.randint(0, 2, (5, 5), dtype=torch.bool) + y = torch.randint(0, 2, (5, 5), dtype=torch.bool) + self.run_test(XorModel(), input=(x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.int32) + y = torch.randint(10, (5, 5), dtype=torch.int32) + self.run_test(XorModel(), input=(x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.double) + y = torch.randint(10, (5, 5), dtype=torch.double) + self.run_test(XorModel(), input=(x, y)) + + x = torch.randint(10, (2, 3, 5), dtype=torch.float32) + y = torch.randint(10, (2, 3, 5), dtype=torch.long) + self.run_test(XorModel(), input=(x, y)) + def test_gt(self): class GreaterModel(torch.nn.Module): def forward(self, input, other): diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 031f4505e655..11aaeb57404e 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -1105,6 +1105,21 @@ def __or_(g, input, other): return g.op('Or', input, other) +@wrap_logical_op_with_cast_to_and_from('Bool') +def logical_and(g, input, other): + return g.op('And', input, other) + + +@wrap_logical_op_with_cast_to_and_from('Bool') +def logical_or(g, input, other): + return g.op('Or', input, other) + + +@wrap_logical_op_with_cast_to_and_from('Bool') +def logical_xor(g, input, other): + return g.op('Xor', input, other) + + def __rshift_(g, self, other): # make sure to cast other to self's type # (when self is long, make sure that other is not float) From 70dcfe29919c5e695c5f2573c96f57fe47071aa9 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 31/41] [ONNX] Enable _jit_pass_onnx_fold_if only when dynamic_axes is None (#50582) (#50910) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50910 Fixing pytorch/vision#3251 (PR #49410 triggers the torch vision test build failure, on three tests test_faster_rcnn, test_mask_rcnn, test_keypoint_rcnn. ) The offending PR is fine on pytorch UT, because the torchvision and pytorch test has a gap when we merge them - we are using different test API on two sides, therefore causing some discrepancy. This PR bridge the gap for the above three tests, and disable _jit_pass_onnx_fold_if pass until it gets fixed. Allow _jit_pass_onnx_fold_if only when dynamic_axes is None. 
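
For illustration only (not part of this patch), a minimal sketch of an export call where `dynamic_axes` is non-empty and the If-folding pass is therefore skipped; the module, file name, and axis names below are made up:

```
import torch

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return x.relu()

x = torch.randn(2, 3, 200, 300)
# With a non-empty dynamic_axes dict, torch._C._jit_pass_onnx_fold_if is not
# run during export; with dynamic_axes omitted (None), the pass still runs.
torch.onnx.export(TinyModel(), (x,), "tiny.onnx", opset_version=11,
                  input_names=["images_tensors"], output_names=["outputs"],
                  dynamic_axes={"images_tensors": [0, 1, 2, 3], "outputs": [0, 1, 2, 3]})
```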
Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050886 Pulled By: SplitInfinity fbshipit-source-id: b765ffe30914261866dcc761f0d0999fd16169e3 --- .jenkins/caffe2/test.sh | 2 +- .jenkins/pytorch/common_utils.sh | 2 +- test/onnx/test_pytorch_onnx_onnxruntime.py | 94 ++++++++++++++++------ torch/onnx/utils.py | 3 +- 4 files changed, 73 insertions(+), 28 deletions(-) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index e6f43b6452cf..ac131ba738ca 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -160,7 +160,7 @@ pip install --user pytest-sugar if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # Check out torch/vision at Jun 11 2020 commit # This hash must match one in .jenkins/pytorch/test.sh - pip install -q --user git+https://github.com/pytorch/vision.git@e70c91a9ff9b8a20e05c133aec6ec3ed538c32fb + pip install -q --user git+https://github.com/pytorch/vision.git@ae0d80b3c52dc98b3a9763bdb974c3ef7b6eb83d pip install -q --user ninja # JIT C++ extensions require ninja, so put it into PATH. export PATH="/var/lib/jenkins/.local/bin:$PATH" diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index b28dcb2f41d8..38799ab782de 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ -66,7 +66,7 @@ function get_bazel() { chmod +x tools/bazel } -TORCHVISION_COMMIT=e70c91a9ff9b8a20e05c133aec6ec3ed538c32fb +TORCHVISION_COMMIT=ae0d80b3c52dc98b3a9763bdb974c3ef7b6eb83d function install_torchvision() { # Check out torch/vision at Jun 11 2020 commit diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index d4b6a9bf9a2b..0ddfcf1e40ad 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -406,19 +406,7 @@ def run_word_language_model(self, model_name): # Only support CPU version, since tracer is not working in GPU RNN. 
self.run_test(model, (x, model.hidden)) - @skipIfUnsupportedOpsetVersion([13]) - @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() # Faster RCNN model is not scriptable - def test_faster_rcnn(self): - model = torchvision.models.detection.faster_rcnn.fasterrcnn_resnet50_fpn(pretrained=True, min_size=200, - max_size=300) - model.eval() - x = torch.randn(2, 3, 200, 300, requires_grad=True) - self.run_test(model, (x,), rtol=1e-3, atol=1e-5) - self.run_test(model, (x,), input_names=["images_tensors"], output_names=["outputs"], - dynamic_axes={"images_tensors": [0, 1, 2, 3], "outputs": [0, 1, 2, 3]}, rtol=1e-3, atol=1e-5) - - def get_image_from_url(self, url): + def get_image_from_url(self, url, size=(300, 200)): import os from urllib.parse import urlsplit from urllib import request @@ -433,15 +421,40 @@ def get_image_from_url(self, url): with open(path, 'wb') as f: f.write(data) image = Image.open(path).convert("RGB") - image = image.resize((300, 200), Image.BILINEAR) + + image = image.resize(size, Image.BILINEAR) + to_tensor = transforms.ToTensor() return to_tensor(image) def get_test_images(self): image_url = "http://farm3.staticflickr.com/2469/3915380994_2e611b1779_z.jpg" - image = self.get_image_from_url(url=image_url) - images = [image] - return images + image = self.get_image_from_url(url=image_url, size=(100, 320)) + + image_url2 = "https://pytorch.org/tutorials/_static/img/tv_tutorial/tv_image05.png" + image2 = self.get_image_from_url(url=image_url2, size=(250, 380)) + + return [image], [image2] + + @skipIfUnsupportedOpsetVersion([13]) + @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() # Faster RCNN model is not scriptable + def test_faster_rcnn(self): + model = torchvision.models.detection.faster_rcnn.fasterrcnn_resnet50_fpn(pretrained=True, min_size=200, + max_size=300) + model.eval() + x = torch.randn(2, 3, 200, 300, requires_grad=True) + self.run_test(model, (x,), rtol=1e-3, atol=1e-5) + self.run_test(model, (x,), input_names=["images_tensors"], output_names=["outputs"], + dynamic_axes={"images_tensors": [0, 1, 2, 3], "outputs": [0, 1, 2, 3]}, rtol=1e-3, atol=1e-5) + dummy_image = [torch.ones(3, 100, 100) * 0.3] + images, test_images = self.get_test_images() + self.run_test(model, (images,), test_with_inputs=[(images,), (test_images,), (dummy_image,)], + input_names=["images_tensors"], output_names=["outputs"], + dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + self.run_test(model, (dummy_image,), test_with_inputs=[(dummy_image,), (images,)], + input_names=["images_tensors"], output_names=["outputs"], + dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, rtol=1e-3, atol=1e-5) def test_paste_mask_in_image(self): # disable profiling @@ -480,11 +493,20 @@ def test_paste_mask_in_image(self): def test_mask_rcnn(self): model = torchvision.models.detection.mask_rcnn.maskrcnn_resnet50_fpn(pretrained=True, min_size=200, max_size=300) - images = self.get_test_images() + images, test_images = self.get_test_images() self.run_test(model, (images,), rtol=1e-3, atol=1e-5) self.run_test(model, (images,), input_names=["images_tensors"], output_names=["boxes", "labels", "scores", "masks"], dynamic_axes={"images_tensors": [0, 1, 2], "boxes": [0, 1], "labels": [0], "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + dummy_image = [torch.ones(3, 100, 100) * 0.3] + self.run_test(model, (images,), test_with_inputs=[(images,), (test_images,), (dummy_image,)], + input_names=["images_tensors"], output_names=["boxes", 
"labels", "scores", "masks"], + dynamic_axes={"images_tensors": [0, 1, 2], "boxes": [0, 1], "labels": [0], + "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + self.run_test(model, (dummy_image,), test_with_inputs=[(dummy_image,), (images,)], + input_names=["images_tensors"], output_names=["boxes", "labels", "scores", "masks"], + dynamic_axes={"images_tensors": [0, 1, 2], "boxes": [0, 1], "labels": [0], + "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) def test_heatmaps_to_keypoints(self): # disable profiling @@ -516,12 +538,33 @@ def test_heatmaps_to_keypoints(self): def test_keypoint_rcnn(self): model = torchvision.models.detection.keypoint_rcnn.keypointrcnn_resnet50_fpn(pretrained=True, min_size=200, max_size=300) - images = self.get_test_images() + images, test_images = self.get_test_images() self.run_test(model, (images,), rtol=1e-3, atol=1e-5) self.run_test(model, (images,), input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], dynamic_axes={"images_tensors": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + dummy_images = [torch.ones(3, 100, 100) * 0.3] + self.run_test(model, (images,), test_with_inputs=[(images,), (test_images,), (dummy_images,)], + input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], + dynamic_axes={"images_tensors": [0, 1, 2]}, + rtol=5e-3, atol=1e-5) + self.run_test(model, (dummy_images,), test_with_inputs=[(dummy_images,), (test_images,)], + input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], + dynamic_axes={"images_tensors": [0, 1, 2]}, + rtol=5e-3, atol=1e-5) + + @skipIfUnsupportedOpsetVersion([13]) + @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() + def test_shufflenet_v2_dynamic_axes(self): + model = torchvision.models.shufflenet_v2_x0_5(pretrained=True) + dummy_input = torch.randn(1, 3, 224, 224, requires_grad=True) + test_inputs = torch.randn(3, 3, 224, 224, requires_grad=True) + self.run_test(model, (dummy_input,), test_with_inputs=[(dummy_input,), (test_inputs,)], + input_names=["input_images"], output_names=["outputs"], + dynamic_axes={"input_images": {0: 'batch_size'}, "output": {0: 'batch_size'}}, + rtol=1e-3, atol=1e-5) @disableScriptTest() def test_word_language_model_RNN_TANH(self): @@ -6016,7 +6059,7 @@ def forward(self, boxes, size): self.run_test(Module(), (boxes, size), input_names=["boxes", "size"], dynamic_axes={"size": [0, 1]}, - test_with_inputs=[(boxes, size_2)]) + test_with_inputs=[(boxes, size), (boxes, size_2)]) @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @@ -6072,7 +6115,7 @@ def forward(self, images): input_test = torch.rand(3, 100, 150) self.run_test(TransformModule(), (input,), input_names=["input1"], dynamic_axes={"input1": [0, 1, 2]}, - test_with_inputs=[(input_test,)]) + test_with_inputs=[(input,), (input_test,)]) @skipIfUnsupportedMinOpsetVersion(11) def test_transform_images(self): @@ -6087,7 +6130,7 @@ def forward(self, images): input = torch.rand(3, 100, 200), torch.rand(3, 200, 200) input_test = torch.rand(3, 100, 200), torch.rand(3, 200, 200) - self.run_test(TransformModule(), (input,), test_with_inputs=[(input_test,)]) + self.run_test(TransformModule(), (input,), test_with_inputs=[(input,), (input_test,)]) def get_features(self, images): s0, s1 = images.shape[-2:] @@ -6104,6 +6147,7 @@ def get_features(self, images): @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_rpn(self): + class RPNModule(torch.nn.Module): def 
__init__(self): super(RPNModule, self).__init__() @@ -6126,7 +6170,7 @@ def forward(self, images, features): dynamic_axes={"input1": [0, 1, 2, 3], "input2": [0, 1, 2, 3], "input3": [0, 1, 2, 3], "input4": [0, 1, 2, 3], "input5": [0, 1, 2, 3], "input6": [0, 1, 2, 3]}, - test_with_inputs=[(images2, test_features)], + test_with_inputs=[(images, features), (images2, test_features)], dict_check=False) @skipIfUnsupportedOpsetVersion([13]) @@ -6154,7 +6198,7 @@ def forward(self, input, boxes): boxes1 = torch.rand(6, 4) * 256 boxes1[:, 2:] += boxes1[:, :2] - self.run_test(TransformModule(), (i, [boxes],), test_with_inputs=[(i1, [boxes1],)]) + self.run_test(TransformModule(), (i, [boxes],), test_with_inputs=[(i, [boxes],), (i1, [boxes1],)]) @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @@ -6189,7 +6233,7 @@ def forward(self, images, features): input_names=["input1", "input2", "input3", "input4", "input5", "input6"], dynamic_axes={"input1": [0, 1, 2, 3], "input2": [0, 1, 2, 3], "input3": [0, 1, 2, 3], "input4": [0, 1, 2, 3], "input5": [0, 1, 2, 3], "input6": [0, 1, 2, 3]}, - test_with_inputs=[(images2, test_features)], + test_with_inputs=[(images, features), (images2, test_features)], dict_check=False) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 7a483c2f728b..a17f2ea2eb2d 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -209,7 +209,8 @@ def _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop=Fa torch._C._jit_pass_onnx_scalar_type_analysis(graph) torch._C._jit_pass_lint(graph) - torch._C._jit_pass_onnx_fold_if(graph) + if dynamic_axes is None or not bool(dynamic_axes): + torch._C._jit_pass_onnx_fold_if(graph) from torch.onnx.symbolic_helper import _export_onnx_opset_version torch._C._jit_pass_onnx_peephole(graph, _export_onnx_opset_version, fixed_batch_size) From 68034197e87a82988b27cad0c7c521868f8fe987 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 32/41] [ONNX] Support gelu for fp16 export (#50487) (#50911) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50911 Need to replace dtype of export created scalars from float to double. (In torch implicit conversion logic, python numbers are double) Test case skipped in CI due to that current CI job env does not have CUDA support. 
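
For context (and not part of this patch), a tiny eager-mode check of the implicit conversion behavior mentioned above: a Python number acts as a double-precision scalar and does not promote a half-precision tensor, which is the behavior the exporter-created constants need to mirror:

```
import torch

x = torch.randn(2, 4, dtype=torch.float16)
# The Python scalar is treated as double, but the result keeps the
# tensor's float16 dtype instead of being promoted to float32/float64.
y = x * 0.5
assert y.dtype == torch.float16
```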
Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050889 Pulled By: SplitInfinity fbshipit-source-id: 1fdde23a68d4793e6b9a82840acc213e5c3aa760 --- scripts/onnx/test.sh | 1 + .../test_pytorch_onnx_onnxruntime_cuda.py | 31 +++++++++++++++++++ torch/onnx/symbolic_opset9.py | 7 ++--- 3 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 test/onnx/test_pytorch_onnx_onnxruntime_cuda.py diff --git a/scripts/onnx/test.sh b/scripts/onnx/test.sh index 3432ea434928..5e9cfa936064 100755 --- a/scripts/onnx/test.sh +++ b/scripts/onnx/test.sh @@ -58,6 +58,7 @@ pytest "${args[@]}" \ --ignore "$top_dir/test/onnx/test_utility_funs.py" \ --ignore "$top_dir/test/onnx/test_pytorch_onnx_caffe2.py" \ --ignore "$top_dir/test/onnx/test_pytorch_onnx_shape_inference.py" \ + --ignore "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py" \ "${test_paths[@]}" # onnxruntime only support py3 diff --git a/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py b/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py new file mode 100644 index 000000000000..24017b125b10 --- /dev/null +++ b/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py @@ -0,0 +1,31 @@ +import unittest +import onnxruntime # noqa +import torch + +from test_pytorch_common import skipIfUnsupportedMinOpsetVersion +from test_pytorch_common import skipIfNoCuda + +from test_pytorch_onnx_onnxruntime import TestONNXRuntime + +class TestONNXRuntime_cuda(unittest.TestCase): + from torch.onnx.symbolic_helper import _export_onnx_opset_version + opset_version = _export_onnx_opset_version + keep_initializers_as_inputs = True + use_new_jit_passes = True + onnx_shape_inference = True + + @skipIfUnsupportedMinOpsetVersion(9) + @skipIfNoCuda + def test_gelu_fp16(self): + class GeluModel(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.gelu(x) + + x = torch.randn(2, 4, 5, 6, requires_grad=True, dtype=torch.float16, device=torch.device('cuda')) + self.run_test(GeluModel(), x, rtol=1e-3, atol=1e-5) + +TestONNXRuntime_cuda.setUp = TestONNXRuntime.setUp +TestONNXRuntime_cuda.run_test = TestONNXRuntime.run_test + +if __name__ == '__main__': + unittest.main(TestONNXRuntime_cuda()) diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 11aaeb57404e..043fbd041897 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -2702,10 +2702,9 @@ def remainder(g, input, other): def gelu(g, self): _sqrt2 = 1.4142135623730951 - erf = g.op('Erf', g.op('Div', self, torch.tensor(_sqrt2))) - erf_plusone = add(g, erf, g.op('Constant', value_t=torch.tensor(1, dtype=torch.float))) - return mul(g, mul(g, self, erf_plusone), g.op('Constant', value_t=torch.tensor(0.5, dtype=torch.float))) - + erf = g.op('Erf', g.op('Div', self, torch.tensor(_sqrt2, dtype=torch.double))) + erf_plusone = add(g, erf, g.op('Constant', value_t=torch.tensor(1, dtype=torch.double))) + return mul(g, mul(g, self, erf_plusone), g.op('Constant', value_t=torch.tensor(0.5, dtype=torch.double))) @parse_args('v', 'i', 'v', 'v', 'f', 'i') def group_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled): From 84e9bff85d22b4d23e24c70ef8b1d715502f7877 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 33/41] [ONNX] Replace optional parameters of Resize with placeholder for ops13. (#50574) (#50954) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50954 * Replace optional parameters of Resize with placeholder for ops13. 
* Use common methods to handle different versions. * Correct flake8 issue. * Update per comments. * Add something to trigger CI again. * Trigger another round of CI. Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050882 Pulled By: SplitInfinity fbshipit-source-id: aea6205a1ba4a0621fe1ac9e0c7d94b92b6d8f21 --- torch/onnx/symbolic_helper.py | 127 +++++++++++++++++++++++++++++++++ torch/onnx/symbolic_opset11.py | 94 +----------------------- 2 files changed, 129 insertions(+), 92 deletions(-) diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 5d11bce82135..6b0184f3be4c 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -10,6 +10,7 @@ import torch.onnx.utils from functools import wraps +from torch._C import OptionalType # Note [Edit Symbolic Files] @@ -430,6 +431,126 @@ def _interpolate_get_scales_and_mode(g, input, size, scale_factor, mode , align_ return scale_factor, mode +def _interpolate_helper(name, dim, interpolate_mode): + def symbolic_fn(g, input, output_size, *args): + scales, align_corners = _get_interpolate_attributes(g, interpolate_mode, args) + align_corners = _maybe_get_scalar(align_corners) + coordinate_transformation_mode = "asymmetric" if interpolate_mode == "nearest" \ + else "align_corners" if align_corners else "pytorch_half_pixel" + + if scales is None: + input_size = g.op("Shape", input) + input_size_beg = _slice_helper(g, input_size, axes=[0], ends=[2], starts=[0]) + output_size = g.op("Cast", output_size, to_i=cast_pytorch_to_onnx['Long']) + output_size = g.op("Concat", input_size_beg, output_size, axis_i=0) + + if _export_onnx_opset_version >= 13: + empty_roi = _optional_input_placeholder_tensor(g) + empty_scales = _optional_input_placeholder_tensor(g) + else: + empty_roi = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + empty_scales = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + + return g.op("Resize", + input, + empty_roi, + empty_scales, + output_size, + coordinate_transformation_mode_s=coordinate_transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=interpolate_mode, # nearest, linear, or cubic + nearest_mode_s="floor") # only valid when mode="nearest" + else: + if _export_onnx_opset_version >= 13: + empty_roi = _optional_input_placeholder_tensor(g) + else: + empty_roi = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + + return g.op("Resize", + input, + empty_roi, + scales, + coordinate_transformation_mode_s=coordinate_transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=interpolate_mode, # nearest, linear, or cubic + nearest_mode_s="floor") # only valid when mode="nearest" + return symbolic_fn + + +def __interpolate_helper(g, input, size, scale_factor, mode, align_corners, recompute_scale_factor): + mode = _maybe_get_const(mode, 's') + if 'linear' in mode: + mode = 'linear' + if 'cubic' in mode: + mode = 'cubic' + align_corners = _maybe_get_const(align_corners, 'b') + align_corners = False if not isinstance(align_corners, bool) else align_corners + coordinate_transformation_mode = "asymmetric" if mode == "nearest" \ + else "align_corners" if align_corners else "pytorch_half_pixel" + + if not _is_none(size) : + input_size = g.op("Shape", input) + input_size = _slice_helper(g, input_size, axes=[0], ends=[2], starts=[0]) + # in some cases size is not a packed list but size is a scalar + # We need to also verify that (_maybe_get_const(size, 't').dim() == 0) + 
# but this information is not always available. Try to get the dim, + # and if not assume that it is not a scalar. + try: + is_scalar = not _is_packed_list(size) and ((_maybe_get_const(size, 't').dim() == 0)) + except AttributeError: + is_scalar = not _is_packed_list(size) + if not is_scalar: + warnings.warn("Cannot verify if the output_size is a scalar " + "while exporting interpolate. Assuming that it is not a scalar.") + + if is_scalar: + rank = _get_tensor_rank(input) + if rank is None: + return _unimplemented("interpolate (with a scalar output_size)", + "missing input shape (try giving an array of output_size values)") + size = _unsqueeze_helper(g, size, [0]) + size = [size for i in range(rank - 2)] + size = g.op("Concat", *size, axis_i=0) + size = g.op("Cast", size, to_i=cast_pytorch_to_onnx['Long']) + size = g.op("Concat", input_size, size, axis_i=0) + + if _export_onnx_opset_version >= 13: + empty_roi = _optional_input_placeholder_tensor(g) + empty_scales = _optional_input_placeholder_tensor(g) + else: + empty_roi = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + empty_scales = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + + return g.op("Resize", + input, + empty_roi, + empty_scales, + size, + coordinate_transformation_mode_s=coordinate_transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=mode, # nearest, linear, or cubic + nearest_mode_s="floor") + else: # if not _is_none(scales) + rank = _get_tensor_rank(input) + if rank is None: + return _unimplemented("interpolate (with scales)", "missing input shape") + + if _export_onnx_opset_version >= 13: + empty_roi = _optional_input_placeholder_tensor(g) + else: + empty_roi = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + + scales = _interpolate_get_scales(g, scale_factor, rank) + return g.op("Resize", + input, + empty_roi, + scales, + coordinate_transformation_mode_s=coordinate_transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=mode, # nearest, linear, or cubic + nearest_mode_s="floor") # only valid when mode="nearest" + + def _unbind_helper(g, self, dim, _outputs): if _export_onnx_opset_version <= 9: from torch.onnx.symbolic_opset9 import unbind @@ -546,6 +667,12 @@ def _is_split_static(split_size_or_sizes, _outputs): return False return True +def _optional_input_placeholder_tensor(g): + n = g.op("prim::Constant") + n.setType(OptionalType.ofTensor()) + return n + + # --------------------------------------------------------------------- # ONNX operator version # --------------------------------------------------------------------- diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index 86c1bd15b656..3792f77ae377 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -208,38 +208,7 @@ def pixel_shuffle(g, self, upscale_factor): def _interpolate(name, dim, interpolate_mode): - def symbolic_fn(g, input, output_size, *args): - scales, align_corners = sym_help._get_interpolate_attributes(g, interpolate_mode, args) - align_corners = sym_help._maybe_get_scalar(align_corners) - coordinate_transformation_mode = "asymmetric" if interpolate_mode == "nearest" \ - else "align_corners" if align_corners else "pytorch_half_pixel" - empty_tensor = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) - - if scales is None: - input_size = g.op("Shape", input) - input_size_beg = sym_help._slice_helper(g, input_size, axes=[0], ends=[2], starts=[0]) - output_size = 
g.op("Cast", output_size, to_i=sym_help.cast_pytorch_to_onnx["Long"]) - output_size = g.op("Concat", input_size_beg, output_size, axis_i=0) - scales = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) - return g.op("Resize", - input, - empty_tensor, # roi only takes effect whith coordinate_transformation_mode="tf_crop_and_resize" - scales, # scales is not needed since we are sending out_size - output_size, - coordinate_transformation_mode_s=coordinate_transformation_mode, - cubic_coeff_a_f=-0.75, # only valid when mode="cubic" - mode_s=interpolate_mode, # nearest, linear, or cubic - nearest_mode_s="floor") # only valid when mode="nearest" - else: - return g.op("Resize", - input, - empty_tensor, # roi only takes effect with coordinate_transformation_mode="tf_crop_and_resize" - scales, # scales is not needed since we are sending out_size - coordinate_transformation_mode_s=coordinate_transformation_mode, - cubic_coeff_a_f=-0.75, # only valid when mode="cubic" - mode_s=interpolate_mode, # nearest, linear, or cubic - nearest_mode_s="floor") # only valid when mode="nearest" - return symbolic_fn + return sym_help._interpolate_helper(name, dim, interpolate_mode) upsample_nearest1d = _interpolate('upsample_nearest1d', 3, "nearest") @@ -252,66 +221,7 @@ def symbolic_fn(g, input, output_size, *args): def __interpolate(g, input, size, scale_factor, mode, align_corners, recompute_scale_factor): - mode = sym_help._maybe_get_const(mode, 's') - if 'linear' in mode: - mode = 'linear' - if 'cubic' in mode: - mode = 'cubic' - align_corners = sym_help._maybe_get_const(align_corners, 'b') - align_corners = False if not isinstance(align_corners, bool) else align_corners - coordinate_transformation_mode = "asymmetric" if mode == "nearest" \ - else "align_corners" if align_corners else "pytorch_half_pixel" - # roi only takes effect with coordinate_transformation_mode="tf_crop_and_resize" - roi = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) - - if not sym_help._is_none(size) : - input_size = g.op("Shape", input) - input_size = sym_help._slice_helper(g, input_size, axes=[0], ends=[2], starts=[0]) - # in some cases size is not a packed list but size is a scalar - # We need to also verify that (sym_help._maybe_get_const(size, 't').dim() == 0) - # but this information is not always available. Try to get the dim, - # and if not assume that it is not a scalar. - try: - is_scalar = not sym_help._is_packed_list(size) and ((sym_help._maybe_get_const(size, 't').dim() == 0)) - except AttributeError: - is_scalar = not sym_help._is_packed_list(size) - if not is_scalar: - warnings.warn("Cannot verify if the output_size is a scalar " - "while exporting interpolate. 
Assuming that it is not a scalar.") - - if is_scalar: - rank = sym_help._get_tensor_rank(input) - if rank is None: - return sym_help._unimplemented("interpolate (with a scalar output_size)", - "missing input shape (try giving an array of output_size values)") - size = sym_help._unsqueeze_helper(g, size, [0]) - size = [size for i in range(rank - 2)] - size = g.op("Concat", *size, axis_i=0) - size = g.op("Cast", size, to_i=sym_help.cast_pytorch_to_onnx['Long']) - size = g.op("Concat", input_size, size, axis_i=0) - scales = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) - return g.op("Resize", - input, - roi, - scales, - size, - coordinate_transformation_mode_s=coordinate_transformation_mode, - cubic_coeff_a_f=-0.75, # only valid when mode="cubic" - mode_s=mode, # nearest, linear, or cubic - nearest_mode_s="floor") - else: # if not sym_help._is_none(scales) - rank = sym_help._get_tensor_rank(input) - if rank is None: - return sym_help._unimplemented("interpolate (with scales)", "missing input shape") - scales = sym_help._interpolate_get_scales(g, scale_factor, rank) - return g.op("Resize", - input, - roi, - scales, - coordinate_transformation_mode_s=coordinate_transformation_mode, - cubic_coeff_a_f=-0.75, # only valid when mode="cubic" - mode_s=mode, # nearest, linear, or cubic - nearest_mode_s="floor") # only valid when mode="nearest" + return sym_help.__interpolate_helper(g, input, size, scale_factor, mode, align_corners, recompute_scale_factor) @parse_args('v', 'i', 'v', 'v') def gather(g, self, dim, index, sparse_grad=False): From e2eb97dd7682d2810071ce78b76543acc1584a9c Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 34/41] [ONNX] Fix param names (#50764) (#50955) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50955 Preserve name of parameters for ONNX. Looks like output->copyMetadata(input) API is giving the same debugName to the output. So the name of the original input is changed. This update avoid the name change by just copying the type. Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050880 Pulled By: SplitInfinity fbshipit-source-id: 8b04e41e6df7f33c5c9c873fb323c21462fc125b --- .../onnx/remove_inplace_ops_for_onnx.cpp | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp index bc26183a25bb..c9b42d76973a 100644 --- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp +++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp @@ -571,29 +571,27 @@ static void PrepareForRemoveMutations(MutationRemover& mr, Block* b) { << "Warning: ONNX Preprocess - Removing mutation on block inputs. " << "This changes graph semantics." 
<< std::endl; + Node* newNode = nullptr; if (input->type()->kind() == TypeKind::ListType) { // Create an aten::list to clone the list in graph inputs - auto newNode = node->owningGraph()->create(aten::list, 1); - newNode->output()->copyMetadata(input); + newNode = node->owningGraph()->create(aten::list, 1); + newNode->output()->setType(input->type()); newNode->addInput(input); - newNode->insertBefore(node); - node->replaceInput(index, newNode->output()); - input->replaceAllUsesAfterNodeWith(node, newNode->output()); + b->prependNode(newNode); } else { // Create an aten::clone to clone the tensor in graph inputs - auto newNode = node->owningGraph()->create(aten::clone, 1); - newNode->output()->copyMetadata(input); + newNode = node->owningGraph()->create(aten::clone, 1); + newNode->output()->setType(input->type()); newNode->addInput(input); auto* noneNode = node->owningGraph()->create(prim::Constant); noneNode->output()->setType(NoneType::get()); newNode->addInput(noneNode->output()); - - newNode->insertBefore(node); + b->prependNode(newNode); noneNode->insertBefore(newNode); - node->replaceInput(index, newNode->output()); - input->replaceAllUsesAfterNodeWith(node, newNode->output()); } + node->replaceInput(index, newNode->output()); + input->replaceAllUsesAfterNodeWith(node, newNode->output()); } } } From 4fb33f1d3a95e29bc850f3415895a15eac93c38c Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Wed, 27 Jan 2021 19:09:14 -0800 Subject: [PATCH 35/41] Trim profiler file paths (#51192) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51192 Trim profiler file paths when using stack traces Test Plan: python test/test_profiler.py -k test_source ``` SumBackward0 0.02% 6.000us 0.51% 154.000us 154.000us 1 test/test_profiler.py(91): test_source ...conda3/envs/pytorch/lib/python3.8/unittest/case.py(633): _callTestMethod ...r/local/miniconda3/envs/pytorch/lib/python3.8/unittest/case.py(676): run ...al/miniconda3/envs/pytorch/lib/python3.8/unittest/case.py(736): __call__ .../local/miniconda3/envs/pytorch/lib/python3.8/unittest/suite.py(122): run ``` Reviewed By: ngimel Differential Revision: D26113905 Pulled By: ilia-cher fbshipit-source-id: 2b71c31b6c4437855d33013d42d977745e6f489f --- torch/autograd/profiler.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index a3d0da1aef9d..0221d80ba684 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -1569,6 +1569,14 @@ def append(s): append(header_sep) + def trim_path(path, src_column_width): + if len(path) > src_column_width: + offset = len(path) - src_column_width + path = path[offset:] + if len(path) > 3: + path = "..." 
+ path[3:] + return path + event_limit = 0 for evt in events: if event_limit == row_limit: @@ -1629,14 +1637,14 @@ def append(s): if has_stack: src_field = "" if len(evt.stack) > 0: - src_field = evt.stack[0][:src_column_width] + src_field = trim_path(evt.stack[0], src_column_width) row_values.append(src_field) append(row_format.format(*row_values)) if has_stack: empty_headers = [""] * (len(headers) - 1) for entry in evt.stack[1:MAX_STACK_ENTRY]: - append(row_format.format(*(empty_headers + [entry[:src_column_width]]))) + append(row_format.format(*(empty_headers + [trim_path(entry, src_column_width)]))) empty_headers.append("") append(row_format.format(*empty_headers)) From ea0d304e2eb28506eb6c5650dcf7ada039c9db7c Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Wed, 27 Jan 2021 19:09:14 -0800 Subject: [PATCH 36/41] Rewrite "ProfilerStep#" in profiler output (#51194) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51194 Aggregate all "ProfilerStep#" together Test Plan: python test/test_profiler.py -k test_kineto_profiler_api Reviewed By: ngimel Differential Revision: D26113907 Pulled By: ilia-cher fbshipit-source-id: 2bc803befc85153f07e770ea3c37b57e2870a1ba --- torch/autograd/profiler.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 0221d80ba684..0e50dcee702e 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -223,7 +223,7 @@ def export_chrome_trace(self, path): '"pid": "CPU functions", ' '"args": {}}, ' % ( - evt.name, + evt.trace_name, evt.time_range.start, evt.time_range.elapsed_us(), evt.thread @@ -241,7 +241,7 @@ def export_chrome_trace(self, path): '"pid": "CPU functions", ' '"id": %s, ' '"cat": "cpu_to_cuda", ' - '"args": {}}, ' % (evt.name, evt.time_range.start, + '"args": {}}, ' % (evt.trace_name, evt.time_range.start, evt.thread, next_id)) f.write('{"name": "%s", ' '"ph": "f", ' @@ -847,10 +847,11 @@ def __init__( self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, is_remote=False, sequence_nr=-1, node_id=-1, device_type=DeviceType.CPU, device_index=0, - is_legacy=False, flops=None): + is_legacy=False, flops=None, trace_name=None): self.id: int = id self.node_id: int = node_id self.name: str = name + self.trace_name: str = trace_name if trace_name is not None else self.name self.time_range: Interval = Interval(start_us, end_us) self.thread: int = thread self.fwd_thread: Optional[int] = fwd_thread @@ -1101,6 +1102,18 @@ def filter_name(name): ] return name in filtered_out_names +# Demangles and optionally rewrites the provided event name, +# with_wildcard - whether to replace certain numbered event names +# with a wildcard name to aggregate them together in the profiler table +# output +def rewrite_name(name, with_wildcard=False): + string_table = StringTable() + name = string_table[name] + if with_wildcard: + if name.startswith("ProfilerStep#"): + name = "ProfilerStep*" + return name + # Parsing of kineto profiler events def parse_kineto_results(result): # result.events() has most of the events - PyTorch op-level and device-level events @@ -1120,7 +1133,6 @@ def parse_kineto_results(result): assert start_record is not None, "Invalid profiler output, __start_profile is missing" # Create and return FunctionEvent list - string_table = StringTable() function_events = [] cuda_corr_map: Dict[int, 
List[torch.autograd.KinetoEvent]] = {} for kineto_event in result.events(): @@ -1142,7 +1154,8 @@ def parse_kineto_results(result): is_async = kineto_event.start_thread_id() != kineto_event.end_thread_id() fe = FunctionEvent( id=kineto_event.correlation_id(), - name=string_table[kineto_event.name()], + name=rewrite_name(name=kineto_event.name(), with_wildcard=True), + trace_name=rewrite_name(name=kineto_event.name(), with_wildcard=False), thread=kineto_event.start_thread_id(), start_us=rel_start_us, end_us=rel_end_us, @@ -1193,7 +1206,6 @@ def get_record_key(record): cuda_records = {} functions = [] record_stack = [] - string_table = StringTable() # cuda start events and the overall profiler start event don't happen # at exactly the same time because we need to record an event on each device @@ -1271,7 +1283,8 @@ def adjusted_time(cuda_record, cuda_records_map): fe = FunctionEvent( id=record.handle(), node_id=record.node_id(), - name=string_table[start.name()], + name=rewrite_name(name=start.name(), with_wildcard=True), + trace_name=rewrite_name(name=start.name(), with_wildcard=False), thread=start.thread_id(), start_us=start_record.cpu_elapsed_us(start), end_us=start_record.cpu_elapsed_us(record), From d14d8c7f7fb64982c082639b49b16d0818e93e7c Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Wed, 27 Jan 2021 19:09:14 -0800 Subject: [PATCH 37/41] Add convenience import (#51195) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51195 Add kineto_available to torch.profiler Test Plan: >>> import torch.profiler >>> torch.profiler.kineto_available() True Reviewed By: ngimel Differential Revision: D26113906 Pulled By: ilia-cher fbshipit-source-id: fe4502d29d10d8bd9459b0504aa0ee856af43acc --- torch/profiler/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/profiler/__init__.py b/torch/profiler/__init__.py index dabbf91dff90..e0f568d7cc4b 100644 --- a/torch/profiler/__init__.py +++ b/torch/profiler/__init__.py @@ -10,3 +10,4 @@ ''' from .profiler import profile, schedule, ProfilerAction, ProfilerActivity +from torch.autograd import kineto_available From 983b8e6b62fb9bc7260d1c52bbe27e99b771ad56 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Wed, 27 Jan 2021 19:33:26 -0800 Subject: [PATCH 38/41] fake_quant: add a more memory efficient version (#50561) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50561 Not for review yet, a bunch of TODOs need finalizing. tl;dr; add an alternative implementation of `fake_quantize` which saves a ask during the forward pass and uses it to calculate the backward. There are two benefits: 1. the backward function no longer needs the input Tensor, and it can be gc'ed earlier by autograd. On MobileNetV2, this reduces QAT overhead by ~15% (TODO: link, and absolute numbers). We add an additional mask Tensor to pass around, but its size is 4x smaller than the input tensor. A future optimization would be to pack the mask bitwise and unpack in the backward. 2. the computation of `qval` can be done only once in the forward and reused in the backward. No perf change observed, TODO verify with better matrics. 
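For reference, a rough Python sketch of the cachemask semantics described above (helper names here are illustrative only; the real implementations are the CPU and CUDA kernels added further down in this diff):

```python
import torch

def fake_quant_cachemask_reference(x, scale, zero_point, quant_min, quant_max):
    # Forward: quantize-dequantize and remember which elements landed inside
    # the representable range. The boolean mask is all the backward needs.
    q = torch.round(x / scale) + zero_point
    mask = (q >= quant_min) & (q <= quant_max)
    y = (torch.clamp(q, quant_min, quant_max) - zero_point) * scale
    return y, mask

def fake_quant_cachemask_backward_reference(dy, mask):
    # Backward: straight-through estimator gated by the saved mask; the
    # original input tensor is no longer needed, so autograd can free it early.
    return dy * mask
```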
TODO: describe in more detail Test Plan: OSS / torchvision / MobileNetV2 ``` python references/classification/train_quantization.py --print-freq 1 --data-path /data/local/packages/ai-group.imagenet-256-smallest-side/prod/ --output-dir ~/nfs/pytorch_vision_tests/ --backend qnnpack --epochs 5 TODO paste results here ``` TODO more Imported from OSS Reviewed By: ngimel Differential Revision: D25918519 fbshipit-source-id: ec544ca063f984de0f765bf833f205c99d6c18b6 --- aten/src/ATen/native/native_functions.yaml | 8 ++ .../cpu/kernels/QuantizedOpKernels.cpp | 34 ++++++++ .../quantized/cuda/fake_quantize_core.cu | 32 ++++++++ .../ATen/native/quantized/fake_quant_affine.h | 10 +++ .../fake_quant_per_tensor_affine.cpp | 81 ++++++++++++++++--- .../pt/quantization_test.py | 14 ++-- test/quantization/test_workflow_module.py | 59 ++++++++++++++ test/test_namedtuple_return_api.py | 4 +- tools/autograd/derivatives.yaml | 3 + torch/overrides.py | 1 + torch/quantization/fake_quantize.py | 16 +++- 11 files changed, 243 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 1856b9a9bf13..1fd04b4255b7 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -4642,6 +4642,14 @@ - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor variants: function +- func: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + variants: function + dispatch: + CPU, CUDA: fake_quantize_per_tensor_affine_cachemask + +- func: fake_quantize_per_tensor_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor + variants: function + - func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor variants: function dispatch: diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 8137049a75c8..5ed6e28663e0 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -2091,6 +2091,38 @@ void fake_quantize_grad_tensor_kernel( }); } +void fake_quantize_tensor_cachemask_kernel( + Tensor& output, + Tensor& mask, + const Tensor& input, + float sc, + int64_t z_point, + int64_t quant_min, + int64_t quant_max) { + float inv_scale = 1.0f / sc; + + auto iter_combined = TensorIteratorConfig() + .check_all_same_dtype(false) + .add_output(output) + .add_output(mask) + .add_input(input) + .build(); + + // TODO(#51090): make it work for other dtypes + iter_combined.for_each([&](char** data, const int64_t* strides, int64_t n) { + for (int64_t i = 0; i < n; i++) { + float* output_val = (float*)(data[0] + i * strides[0]); + bool* mask_val = (bool*)(data[1] + i * strides[1]); + float* input_val = (float*)(data[2] + i * strides[2]); + + const auto qval = static_cast(z_point + std::nearbyint(*input_val * inv_scale)); + *output_val = (std::fmin(std::fmax(qval, quant_min), quant_max) - z_point) * sc; + *mask_val = ((quant_min <= qval) && (qval <= quant_max)); + } + }); + +} + void fake_quantize_learnable_tensor_grad_kernel_cpu( TensorIterator& iter, float scale, @@ -3054,6 +3086,8 @@ REGISTER_DISPATCH(fake_quant_grad_tensor_stub, &fake_quantize_grad_tensor_kernel); 
REGISTER_DISPATCH(fake_quant_per_channel_stub, &fake_quant_per_channel_cpu); REGISTER_DISPATCH(fake_quant_tensor_stub, &fake_quantize_tensor_kernel); +REGISTER_DISPATCH(fake_quant_tensor_cachemask_stub, + &fake_quantize_tensor_cachemask_kernel); REGISTER_DISPATCH(qadaptive_avg_pool2d_nhwc_stub, &qadaptive_avg_pool2d_nhwc_kernel); REGISTER_DISPATCH(qadaptive_avg_pool3d_ndhwc_stub, diff --git a/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu b/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu index e2f51398b48f..87937df546a8 100644 --- a/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu +++ b/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu @@ -68,6 +68,37 @@ void fake_quantize_grad_tensor_kernel_cuda( }); } +void fake_quantize_tensor_cachemask_kernel_cuda( + Tensor& output, + Tensor& mask, + const Tensor& input, + float scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max) { + + float inv_scale = 1.0f / scale; + auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .add_output(output) + .add_output(mask) + .add_input(input) + .build(); + + gpu_kernel_multiple_outputs( + iter, + [=] GPU_LAMBDA (float input_val) -> thrust::tuple { + const auto qval = static_cast(std::nearbyint(input_val * inv_scale) + zero_point); + return { + // fake_quantized value + (fminf(quant_max, fmaxf(quant_min, qval)) - zero_point) * scale, + // mask for grad + ((quant_min <= qval) && (qval <= quant_max)) + }; + } + ); +} + void _fake_quantize_grad_learnable_tensor_kernel_cuda( TensorIterator& iter, float scale, @@ -96,6 +127,7 @@ void _fake_quantize_grad_learnable_tensor_kernel_cuda( } REGISTER_DISPATCH(fake_quant_tensor_stub, &fake_quantize_tensor_kernel_cuda); +REGISTER_DISPATCH(fake_quant_tensor_cachemask_stub, &fake_quantize_tensor_cachemask_kernel_cuda); REGISTER_DISPATCH(fake_quant_grad_tensor_stub, &fake_quantize_grad_tensor_kernel_cuda); REGISTER_DISPATCH(fake_quant_grad_learnable_tensor_stub, &_fake_quantize_grad_learnable_tensor_kernel_cuda); diff --git a/aten/src/ATen/native/quantized/fake_quant_affine.h b/aten/src/ATen/native/quantized/fake_quant_affine.h index 7a90ff57ae1b..6865c75f4a49 100644 --- a/aten/src/ATen/native/quantized/fake_quant_affine.h +++ b/aten/src/ATen/native/quantized/fake_quant_affine.h @@ -26,6 +26,15 @@ using fake_quant_grad_tensor_fn = void (*)( int64_t quant_min, int64_t quant_max); +using fake_quant_tensor_cachemask_fn = void (*)( + Tensor& output, + Tensor& mask, + const Tensor& input, + float sc, + int64_t z_point, + int64_t quant_min, + int64_t quant_max); + using fake_quant_learnable_grad_tensor_fn = void (*)( TensorIterator& iter, float scale, @@ -36,6 +45,7 @@ using fake_quant_learnable_grad_tensor_fn = void (*)( DECLARE_DISPATCH(fake_quant_tensor_fn, fake_quant_tensor_stub); DECLARE_DISPATCH(fake_quant_grad_tensor_fn, fake_quant_grad_tensor_stub); +DECLARE_DISPATCH(fake_quant_tensor_cachemask_fn, fake_quant_tensor_cachemask_stub); DECLARE_DISPATCH(fake_quant_learnable_grad_tensor_fn, fake_quant_grad_learnable_tensor_stub); using fake_quant_per_channel_fn = void (*)( diff --git a/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp b/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp index fb0853cf2ff2..a782033d5002 100644 --- a/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp +++ b/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp @@ -12,16 +12,19 @@ namespace native { // Use REGISTER_DISPATCH to run CPU and CUDA backend. 
DEFINE_DISPATCH(fake_quant_tensor_stub); DEFINE_DISPATCH(fake_quant_grad_tensor_stub); +DEFINE_DISPATCH(fake_quant_tensor_cachemask_stub); DEFINE_DISPATCH(fake_quant_grad_learnable_tensor_stub); /* Fake-quantizes the 'inputs' tensor. + Args: - X: Forward input tensor. + self: Forward input tensor. dY: Backward input tensor (_backward op only). scale: scale of per tensor affine quantization zero_point: zero_point of per tensor affine quantization quant_min: minimum quantized value quant_max: maximum quantized value + Returns: Quantized tensor (double dtype). @@ -50,22 +53,15 @@ Tensor fake_quantize_per_tensor_affine( /* Backward path to fake-quantize the 'inputs' tensor. Args: - X: Forward input tensor. dY: Backward input tensor. + X: Forward input tensor. scale: scale of per tensor affine quantization zero_point: zero_point of per tensor affine quantization quant_min: minimum quantized value quant_max: maximum quantized value - quant_delay: Count of global steps for which to delay the quantization. - See note in forward. - iter: The current quantization iteration used for `quant_delay`. + Returns: Quantized tensor (double dtype). - -Notes: - - quant_delay might be set to non-zero to help weights stabilize in the - beginning of the training. - - quantization range [0, 2^bits - 1] */ Tensor fake_quantize_per_tensor_affine_backward( @@ -95,6 +91,71 @@ Tensor fake_quantize_per_tensor_affine_backward( return dX; } +/* Fake-quantizes the 'inputs' tensor, saving a mask for the backward pass. + +This is numerically equivalent to `fake_quantize_per_tensor_affine`, +but has a lower memory overhead in the backward pass. + +Args: + self: Forward input tensor. + scale: scale of per tensor affine quantization + zero_point: zero_point of per tensor affine quantization + quant_min: minimum quantized value + quant_max: maximum quantized value + +Returns: + Quantized tensor (double dtype). + Mask (bool dtype). +*/ +std::tuple fake_quantize_per_tensor_affine_cachemask( + const Tensor& self, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max) { + TORCH_CHECK(self.scalar_type() == ScalarType::Float); + TORCH_CHECK( + quant_min <= quant_max, + "`quant_min` should be less than or \ + equal to `quant_max`."); + TORCH_CHECK( + zero_point >= quant_min && zero_point <= quant_max, + "`zero_point` must be between `quant_min` and `quant_max`."); + + auto Y = at::empty_like(self, self.options(), MemoryFormat::Preserve); + auto mask = at::empty_like(self, at::kBool, MemoryFormat::Preserve); + fake_quant_tensor_cachemask_stub( + self.device().type(), Y, mask, self, scale, zero_point, quant_min, quant_max); + // TODO(future, optional): look into packing the mask further (BoolTensor uses + // 1 byte per element, we only need 1 bit per element). + return std::make_tuple(Y, mask); +} + +/* Backward path to fake-quantize the 'inputs' tensor, with mask. + +Args: + dY: output grad. + mask: mask tensor from the forward pass. + +Returns: + dX (input grad). +*/ +Tensor fake_quantize_per_tensor_affine_cachemask_backward( + const Tensor& dY, + const Tensor& mask) { + TORCH_CHECK(dY.scalar_type() == ScalarType::Float); + TORCH_CHECK(mask.scalar_type() == ScalarType::Bool); + TORCH_CHECK(mask.numel() == dY.numel(), + "`mask` and `dY` are not the same size: ", + "`mask` is size ", mask.numel(), " and `dY` is size ", dY.numel()); + if (dY.numel() <= 0) { + return dY; + } + // Note: no additional kernels needed, since mask is pre-computed + // and we can use the existing tensor multiplication kernels. 
+ return dY * mask; +} + int64_t _get_zero_point_from_tensor( const Tensor& zero_point, int64_t quant_min, diff --git a/benchmarks/operator_benchmark/pt/quantization_test.py b/benchmarks/operator_benchmark/pt/quantization_test.py index af09a5fa2523..a8377fb3e488 100644 --- a/benchmarks/operator_benchmark/pt/quantization_test.py +++ b/benchmarks/operator_benchmark/pt/quantization_test.py @@ -130,35 +130,38 @@ def forward(self, input, scales, zero_points, axis: int, dtype: int): 'attr_names': ['N', 'C', 'H', 'W'], 'attrs': [ [1, 3, 512, 512], - [1, 3, 512, 512] ], 'tags': ['short'] } fake_quantize_configs_long_dict = { 'N': [1], - 'C': [1, 3, 8], + 'C': [1, 3, 8, 32], 'H': [256, 1024], 'W': [256, 1024], 'tags': ['long'] } fake_quantize_configs_short = op_bench.config_list( + cross_product_configs={ + 'device': ('cpu', 'cuda'), + }, **fake_quantize_configs_short_dict ) fake_quantize_configs_long = op_bench.cross_product_configs( + device=('cpu', 'cuda'), **fake_quantize_configs_long_dict ) class FakeQuantizeBenchmark(op_bench.TorchBenchmarkBase): r"""Benchmarks fake quantization with default parameters.""" - def init(self, N, C, H, W): + def init(self, N, C, H, W, device): self.inputs = { - "input": torch.rand(N, C, H, W) + "input": torch.rand(N, C, H, W).to(device) } - self.op = tq.FakeQuantize() + self.op = tq.FakeQuantize().to(device) self.set_module_name('FakeQuantize') def forward(self, input): @@ -169,6 +172,7 @@ def forward(self, input): fake_quantize_configs_short + fake_quantize_configs_long, FakeQuantizeBenchmark) + # op_type is used to describe the type of operator used in benchmarking: # py_module represents the operator written in Python that can # backpropagate on scale and zero point. diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index 866e1971ab19..869cd2cf3715 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -861,6 +861,65 @@ def test_backward_per_tensor(self, device, X): Y_prime.backward(dout) np.testing.assert_allclose(dX.cpu(), X.grad.cpu().detach().numpy(), rtol=tolerance, atol=tolerance) + def _test_forward_per_tensor_cachemask_impl(self, device): + for torch_type in (torch.qint8, torch.quint8): + X = torch.randn(4, 8).to(device) + # pick the scale + zp so that some values get clipped + obs = torch.quantization.MinMaxObserver(torch_type) + obs(X * 0.75) + scale, zero_point = obs.calculate_qparams() + scale, zero_point = float(scale), int(zero_point) + quant_min, quant_max = obs._calculate_qmin_qmax() + + Y_test, _mask = torch.fake_quantize_per_tensor_affine_cachemask( + X, scale, zero_point, quant_min, quant_max) + Y_ref = _fake_quantize_per_tensor_affine_reference( + X.cpu(), scale, zero_point, quant_min, quant_max).to(device) + self.assertTrue(torch.allclose(Y_test, Y_ref, rtol=tolerance, atol=tolerance)) + + def test_forward_per_tensor_cachemask_cpu(self): + device = torch.device('cpu') + self._test_forward_per_tensor_cachemask_impl(device) + + @unittest.skipIf(not TEST_CUDA, "No gpu is not available.") + def test_forward_per_tensor_cachemask_cuda(self): + device = torch.device('cuda') + self._test_forward_per_tensor_cachemask_impl(device) + + def _test_backward_per_tensor_cachemask_impl(self, device): + for torch_type in (torch.qint8, torch.quint8): + X = torch.randn(4, 8).to(device) + X.requires_grad_() + # pick the scale + zp so that some values get clipped + obs = torch.quantization.MinMaxObserver(torch_type) + obs(X * 0.75) + scale, zero_point = 
obs.calculate_qparams() + scale, zero_point = float(scale), int(zero_point) + quant_min, quant_max = obs._calculate_qmin_qmax() + + # forward pass + Y_test, mask = torch.fake_quantize_per_tensor_affine_cachemask( + X, scale, zero_point, quant_min, quant_max) + Y_ref = _fake_quantize_per_tensor_affine_reference( + X.cpu(), scale, zero_point, quant_min, quant_max).to(device) + self.assertTrue(torch.allclose(Y_test, Y_ref, rtol=tolerance, atol=tolerance)) + + # backward pass + dout = torch.rand(X.shape, dtype=torch.float).to(device) + dX = _fake_quantize_per_tensor_affine_grad_reference( + dout, X, scale, zero_point, quant_min, quant_max) + Y_test.backward(dout) + self.assertTrue(torch.allclose(dX, X.grad)) + + def test_backward_per_tensor_cachemask_cpu(self): + device = torch.device('cpu') + self._test_backward_per_tensor_cachemask_impl(device) + + @unittest.skipIf(not TEST_CUDA, "No gpu is not available.") + def test_backward_per_tensor_cachemask_cuda(self): + device = torch.device('cuda') + self._test_backward_per_tensor_cachemask_impl(device) + @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']), X=hu.tensor(shapes=hu.array_shapes(1, 5,), elements=hu.floats(-1e3, 1e3, allow_nan=False, allow_infinity=False), diff --git a/test/test_namedtuple_return_api.py b/test/test_namedtuple_return_api.py index 00432c9e71cd..5ee70c0dacd1 100644 --- a/test/test_namedtuple_return_api.py +++ b/test/test_namedtuple_return_api.py @@ -14,7 +14,7 @@ 'max', 'min', 'median', 'nanmedian', 'mode', 'kthvalue', 'svd', 'symeig', 'eig', 'qr', 'geqrf', 'solve', 'slogdet', 'sort', 'topk', 'lstsq', 'triangular_solve', 'cummax', 'cummin', 'linalg_eigh', "unpack_dual", 'linalg_qr', - '_svd_helper', 'linalg_svd', 'linalg_slogdet', + '_svd_helper', 'linalg_svd', 'linalg_slogdet', 'fake_quantize_per_tensor_affine_cachemask', } @@ -68,6 +68,8 @@ def test_namedtuple_return(self): op(operators=['lstsq'], input=(a,), names=('solution', 'QR'), hasout=True), op(operators=['linalg_eigh'], input=("L",), names=('eigenvalues', 'eigenvectors'), hasout=True), op(operators=['linalg_slogdet'], input=(), names=('sign', 'logabsdet'), hasout=True), + op(operators=['fake_quantize_per_tensor_affine_cachemask'], + input=(0.1, 0, 0, 255), names=('output', 'mask',), hasout=False), op(operators=['unpack_dual'], input=(0,), names=('primal', 'tangent'), hasout=False), ] diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index c199d5a4e9df..a9a751abd260 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -461,6 +461,9 @@ - name: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor self: fake_quantize_per_tensor_affine_backward(grad, self, scale, zero_point, quant_min, quant_max) +- name: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + self: fake_quantize_per_tensor_affine_cachemask_backward(grad, mask) + - name: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor self, scale, zero_point: "grad.defined() ? 
_fake_quantize_learnable_per_tensor_affine_backward(grad, self, scale, zero_point, quant_min, quant_max) : std::tuple()" diff --git a/torch/overrides.py b/torch/overrides.py index 1a5ebfb9a133..187bb0425dc7 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -391,6 +391,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.expm1: lambda input, out=None: -1, torch.fake_quantize_per_channel_affine: lambda input, scale, zero_point, axis, quant_min, quant_max: -1, torch.fake_quantize_per_tensor_affine: lambda input, scale, zero_point, quant_min, quant_max: -1, + torch.fake_quantize_per_tensor_affine_cachemask: lambda input, scale, zero_point, quant_min, quant_max: -1, torch.fbgemm_linear_fp16_weight: lambda input, packed_weight, bias: -1, torch.fbgemm_linear_fp16_weight_fp32_activation: lambda input, packed_weight, bias: -1, torch.fbgemm_linear_int8_weight: lambda input, weight, packed, col_offsets, weight_scale, weight_zero_point, bias: -1, diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index 46dba803a1ff..4d29db46acc4 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -140,9 +140,19 @@ def forward(self, X): X = torch.fake_quantize_per_channel_affine(X, self.scale, self.zero_point, self.ch_axis, self.quant_min, self.quant_max) else: - X = torch.fake_quantize_per_tensor_affine(X, float(self.scale), - int(self.zero_point), self.quant_min, - self.quant_max) + if self.training: + # During training, use the memory optimized fake_quant + # forward. It has a reduced memory overhead in the backward + # pass compared to fake_quantize_per_tensor_affine. + X, _mask = torch.fake_quantize_per_tensor_affine_cachemask( + X, float(self.scale), int(self.zero_point), + self.quant_min, self.quant_max) + else: + # During inference, use the fastest fake_quant + # which does not compute any extra info for the backward. + X = torch.fake_quantize_per_tensor_affine( + X, float(self.scale), int(self.zero_point), + self.quant_min, self.quant_max) return X @torch.jit.export From 0335222a4ae3219061b36257dde411694ffb3f67 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Wed, 27 Jan 2021 19:33:26 -0800 Subject: [PATCH 39/41] memory efficient fq: use it everywhere, delete the old version (#51159) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51159 This PR is the cleanup after #50561. High level, we make the new definition of fake_quant be the definition used by autograd, but keep the old function around as a thin wrapper to keep the user facing API the same. In detail: 1. point `fake_quantize_per_tensor_affine`'s implementation to be `fake_quantize_per_tensor_affine_cachemask` 2. delete the `fake_quantize_per_tensor_affine` backward, autograd will automatically use the cachemask backward 3. delete all the `fake_quantize_per_tensor_affine` kernels, since they are no longer used by anything Test Plan: ``` python test/test_quantization.py TestFakeQuantize ``` performance testing was done in the previous PR. 
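A quick sanity check that the user-facing call and its gradient still behave the same after the rewire (minimal sketch; the values are arbitrary):

```python
import torch

x = torch.randn(4, 8, requires_grad=True)
# Same public API as before; internally this now routes through the
# cachemask op, and autograd picks up the mask-based backward.
y = torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255)
y.sum().backward()
# Gradients are zero wherever the fake-quantized value was clipped.
print(x.grad)
```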
Imported from OSS Reviewed By: jerryzh168 Differential Revision: D26090869 fbshipit-source-id: fda042881f77a993a9d15dafabea7cfaf9dc7c9c --- aten/src/ATen/native/native_functions.yaml | 5 +- .../cpu/kernels/QuantizedOpKernels.cpp | 40 ------------- .../quantized/cuda/fake_quantize_core.cu | 50 ---------------- .../ATen/native/quantized/fake_quant_affine.h | 19 ------ .../fake_quant_per_tensor_affine.cpp | 59 +------------------ .../check_backward_compatibility.py | 1 + tools/autograd/derivatives.yaml | 3 - torch/quantization/fake_quantize.py | 16 +---- 8 files changed, 8 insertions(+), 185 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 1fd04b4255b7..0f86bc4d3819 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -4637,10 +4637,7 @@ - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor variants: function dispatch: - CPU, CUDA: fake_quantize_per_tensor_affine - -- func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor - variants: function + Math: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) variants: function diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 5ed6e28663e0..f4c1b1d572f4 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -2054,43 +2054,6 @@ void q_batch_norm_kernel( } -void fake_quantize_tensor_kernel( - Tensor& output, - const Tensor& input, - float sc, - int64_t z_point, - int64_t quant_min, - int64_t quant_max) { - float inv_scale = 1.0f / sc; - auto iter = TensorIterator::unary_op(output, input); - cpu_kernel(iter, [&](float self) -> float { - return (std::fmin( - std::fmax( - static_cast( - z_point + std::nearbyint(self * inv_scale)), - quant_min), - quant_max) - - z_point) * - sc; - }); -} - -void fake_quantize_grad_tensor_kernel( - Tensor& input_grad, - const Tensor& input, - const Tensor& output_grad, - float sc, - int64_t z_point, - int64_t quant_min, - int64_t quant_max) { - float inv_scale = 1.0f / sc; - auto iter = TensorIterator::binary_op(input_grad, input, output_grad); - cpu_kernel(iter, [&](float x, float dy) -> float { - int64_t xq = static_cast(z_point + std::nearbyint(x * inv_scale)); - return dy * (xq >= quant_min && xq <= quant_max); - }); -} - void fake_quantize_tensor_cachemask_kernel( Tensor& output, Tensor& mask, @@ -3082,10 +3045,7 @@ REGISTER_DISPATCH(fake_quant_grad_learnable_tensor_stub, &fake_quantize_learnable_tensor_grad_kernel_cpu); REGISTER_DISPATCH(fake_quant_grad_per_channel_stub, &fake_quant_grad_per_channel_cpu); -REGISTER_DISPATCH(fake_quant_grad_tensor_stub, - &fake_quantize_grad_tensor_kernel); REGISTER_DISPATCH(fake_quant_per_channel_stub, &fake_quant_per_channel_cpu); -REGISTER_DISPATCH(fake_quant_tensor_stub, &fake_quantize_tensor_kernel); REGISTER_DISPATCH(fake_quant_tensor_cachemask_stub, &fake_quantize_tensor_cachemask_kernel); REGISTER_DISPATCH(qadaptive_avg_pool2d_nhwc_stub, diff --git a/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu b/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu index 
87937df546a8..9d74bc9a3400 100644 --- a/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu +++ b/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu @@ -20,54 +20,6 @@ Returns: */ namespace at { namespace native { -void fake_quantize_tensor_kernel_cuda( - Tensor& output, - const Tensor& input, - float scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max) { - // scalar type of this function is guaranteed to be float - float inv_scale = 1.0f / scale; - auto iter = TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(output) - .add_input(input) - .build(); - gpu_kernel(iter, [=] GPU_LAMBDA(float input_val) -> float { - return (fminf( - quant_max, - fmaxf( - quant_min, - static_cast( - std::nearbyint(input_val * inv_scale) + zero_point))) - - zero_point) * - scale; - }); -} - -void fake_quantize_grad_tensor_kernel_cuda( - Tensor& input_grad, - const Tensor& input, - const Tensor& output_grad, - float scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max) { - // scalar type of this function is guaranteed to be float - float inv_scale = 1.0f / scale; - auto iter = TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(input_grad) - .add_input(output_grad) - .add_input(input) - .build(); - gpu_kernel(iter, [=] GPU_LAMBDA(float dy, float x) -> float { - int64_t Xq = std::nearbyint(x * inv_scale) + zero_point; - return (Xq >= quant_min && Xq <= quant_max) * dy; - }); -} - void fake_quantize_tensor_cachemask_kernel_cuda( Tensor& output, Tensor& mask, @@ -126,9 +78,7 @@ void _fake_quantize_grad_learnable_tensor_kernel_cuda( }); } -REGISTER_DISPATCH(fake_quant_tensor_stub, &fake_quantize_tensor_kernel_cuda); REGISTER_DISPATCH(fake_quant_tensor_cachemask_stub, &fake_quantize_tensor_cachemask_kernel_cuda); -REGISTER_DISPATCH(fake_quant_grad_tensor_stub, &fake_quantize_grad_tensor_kernel_cuda); REGISTER_DISPATCH(fake_quant_grad_learnable_tensor_stub, &_fake_quantize_grad_learnable_tensor_kernel_cuda); // Fake quantize per channel diff --git a/aten/src/ATen/native/quantized/fake_quant_affine.h b/aten/src/ATen/native/quantized/fake_quant_affine.h index 6865c75f4a49..0215f521eb1f 100644 --- a/aten/src/ATen/native/quantized/fake_quant_affine.h +++ b/aten/src/ATen/native/quantized/fake_quant_affine.h @@ -9,23 +9,6 @@ struct TensorIterator; namespace native { -using fake_quant_tensor_fn = void (*)( - Tensor& output, - const Tensor& input, - float sc, - int64_t z_point, - int64_t quant_min, - int64_t quant_max); - -using fake_quant_grad_tensor_fn = void (*)( - Tensor& input_grad, - const Tensor& input, - const Tensor& output_grad, - float sc, - int64_t z_point, - int64_t quant_min, - int64_t quant_max); - using fake_quant_tensor_cachemask_fn = void (*)( Tensor& output, Tensor& mask, @@ -43,8 +26,6 @@ using fake_quant_learnable_grad_tensor_fn = void (*)( int64_t quant_min, int64_t quant_max); -DECLARE_DISPATCH(fake_quant_tensor_fn, fake_quant_tensor_stub); -DECLARE_DISPATCH(fake_quant_grad_tensor_fn, fake_quant_grad_tensor_stub); DECLARE_DISPATCH(fake_quant_tensor_cachemask_fn, fake_quant_tensor_cachemask_stub); DECLARE_DISPATCH(fake_quant_learnable_grad_tensor_fn, fake_quant_grad_learnable_tensor_stub); diff --git a/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp b/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp index a782033d5002..128db5ebb7b4 100644 --- a/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp +++ b/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp @@ -10,8 +10,6 @@ 
namespace at { namespace native { // Use REGISTER_DISPATCH to run CPU and CUDA backend. -DEFINE_DISPATCH(fake_quant_tensor_stub); -DEFINE_DISPATCH(fake_quant_grad_tensor_stub); DEFINE_DISPATCH(fake_quant_tensor_cachemask_stub); DEFINE_DISPATCH(fake_quant_grad_learnable_tensor_stub); @@ -35,60 +33,9 @@ Tensor fake_quantize_per_tensor_affine( int64_t zero_point, int64_t quant_min, int64_t quant_max) { - TORCH_CHECK(self.scalar_type() == ScalarType::Float); - TORCH_CHECK( - quant_min <= quant_max, - "`quant_min` should be less than or \ - equal to `quant_max`."); - TORCH_CHECK( - zero_point >= quant_min && zero_point <= quant_max, - "`zero_point` must be between `quant_min` and `quant_max`."); - - auto Y = at::empty_like(self, self.options(), MemoryFormat::Preserve); - fake_quant_tensor_stub( - self.device().type(), Y, self, scale, zero_point, quant_min, quant_max); - return Y; -} - -/* Backward path to fake-quantize the 'inputs' tensor. - -Args: - dY: Backward input tensor. - X: Forward input tensor. - scale: scale of per tensor affine quantization - zero_point: zero_point of per tensor affine quantization - quant_min: minimum quantized value - quant_max: maximum quantized value - -Returns: - Quantized tensor (double dtype). -*/ - -Tensor fake_quantize_per_tensor_affine_backward( - const Tensor& dY, - const Tensor& X, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max) { - TORCH_CHECK(dY.scalar_type() == ScalarType::Float); - TORCH_CHECK(X.scalar_type() == ScalarType::Float); - TORCH_CHECK(X.numel() == dY.numel(), "`X` and `dY` are not the same size"); - TORCH_CHECK( - quant_min <= quant_max, - "`quant_min` should be less than or \ - equal to `quant_max`."); - TORCH_CHECK( - zero_point >= quant_min && zero_point <= quant_max, - "`zero_point` must be between `quant_min` and `quant_max`."); - if (X.numel() <= 0) { - return X; - } - - auto dX = at::empty_like(X, X.options(), MemoryFormat::Preserve); - fake_quant_grad_tensor_stub( - X.device().type(), dX, X, dY, scale, zero_point, quant_min, quant_max); - return dX; + const auto res = at::fake_quantize_per_tensor_affine_cachemask( + self, scale, zero_point, quant_min, quant_max); + return std::get<0>(res); } /* Fake-quantizes the 'inputs' tensor, saving a mask for the backward pass. diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 981f21d5de67..658c61dd4425 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -56,6 +56,7 @@ ("aten::_multinomial_alias_setup", datetime.date(2021, 1, 31)), ("aten::_multinomial_alias_draw", datetime.date(2021, 1, 31)), ("prim::profile_optional", datetime.date(2021, 1, 31)), + ("aten::fake_quantize_per_tensor_affine_backward", datetime.date(2021, 2, 20)), ] def allow_listed(schema, allow_list): diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index a9a751abd260..31136610ed4a 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -458,9 +458,6 @@ - name: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) 
self: zeros_like(grad) -- name: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor - self: fake_quantize_per_tensor_affine_backward(grad, self, scale, zero_point, quant_min, quant_max) - - name: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) self: fake_quantize_per_tensor_affine_cachemask_backward(grad, mask) diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index 4d29db46acc4..58d0ad7a055a 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -140,19 +140,9 @@ def forward(self, X): X = torch.fake_quantize_per_channel_affine(X, self.scale, self.zero_point, self.ch_axis, self.quant_min, self.quant_max) else: - if self.training: - # During training, use the memory optimized fake_quant - # forward. It has a reduced memory overhead in the backward - # pass compared to fake_quantize_per_tensor_affine. - X, _mask = torch.fake_quantize_per_tensor_affine_cachemask( - X, float(self.scale), int(self.zero_point), - self.quant_min, self.quant_max) - else: - # During inference, use the fastest fake_quant - # which does not compute any extra info for the backward. - X = torch.fake_quantize_per_tensor_affine( - X, float(self.scale), int(self.zero_point), - self.quant_min, self.quant_max) + X = torch.fake_quantize_per_tensor_affine( + X, float(self.scale), int(self.zero_point), + self.quant_min, self.quant_max) return X @torch.jit.export From dfdb1547b9c1934904bfd137b4007d6a46a6f597 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Wed, 27 Jan 2021 19:42:12 -0800 Subject: [PATCH 40/41] Revert D26094906: Add serialization logic for complex numbers Test Plan: revert-hammer Differential Revision: D26094906 (https://github.com/pytorch/pytorch/commit/2de4ecd4ebc99d509b8f13ff12ed241c7433a0ad) Original commit changeset: 7b2614f3ee4a fbshipit-source-id: 6f32a9fc6bb2a904ca1a282bbc6b2df0aee50068 --- test/test_complex.py | 12 ------------ torch/csrc/jit/python/pybind_utils.h | 2 -- torch/csrc/jit/serialization/pickler.cpp | 10 ---------- torch/csrc/jit/serialization/pickler.h | 1 - torch/csrc/jit/serialization/unpickler.cpp | 13 +++---------- 5 files changed, 3 insertions(+), 35 deletions(-) diff --git a/test/test_complex.py b/test/test_complex.py index cc8ed7f62398..e6a705032d74 100644 --- a/test/test_complex.py +++ b/test/test_complex.py @@ -12,18 +12,6 @@ def fn(a: complex): self.checkScript(fn, (3 + 5j,)) - def test_pickle(self): - class ComplexModule(torch.jit.ScriptModule): - def __init__(self): - super().__init__() - self.a = 3 + 5j - - def forward(self, b: int): - return b - - loaded = self.getExportImportCopy(ComplexModule()) - self.assertEqual(loaded.a, 3 + 5j) - class TestComplexTensor(TestCase): @dtypes(*torch.testing.get_all_complex_dtypes()) def test_to_list(self, device, dtype): diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index eca56876999f..06c57f32f15c 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -293,8 +293,6 @@ inline InferredType tryToInferType(py::handle input) { return InferredType(IntType::get()); } else if (py::isinstance(input)) { return InferredType(FloatType::get()); - } else if (PyComplex_CheckExact(input.ptr())) { - return InferredType(ComplexDoubleType::get()); } else if (py::isinstance(input)) { return InferredType(StringType::get()); } else if 
(THPLayout_Check(input.ptr())) { diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index a0dd826c4267..e2118af2019d 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -49,8 +49,6 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { pushTuple(ivalue); } else if (ivalue.isDouble()) { pushDouble(ivalue.toDouble()); - } else if (ivalue.isComplexDouble()) { - pushComplexDouble(ivalue); } else if (ivalue.isInt()) { pushInt(ivalue.toInt()); } else if (ivalue.isBool()) { @@ -466,14 +464,6 @@ void Pickler::pushDouble(double value) { // Python pickle format is big endian, swap. push(swapDouble(value)); } -void Pickler::pushComplexDouble(const IValue& value) { - c10::complex d = value.toComplexDouble(); - pushGlobal("builtins", "complex"); - pushIValue(d.real()); - pushIValue(d.imag()); - push(PickleOpCode::TUPLE2); - push(PickleOpCode::REDUCE); -} void Pickler::pushLong(const std::string& data) { uint64_t size = data.size(); diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index 4dc216ec702b..21d0f61a18eb 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -160,7 +160,6 @@ class TORCH_API Pickler { void endTypeTag(const IValue& value); void pushBool(bool value); void pushDouble(double value); - void pushComplexDouble(const IValue& value); void pushGenericList(const IValue& ivalue); void pushIntList(const IValue& ivalue); void pushList(const IValue& ivalue); diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index efeaac75c41c..f363fe73f1e9 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -57,7 +57,6 @@ void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) { case StorageType::Kind: case NumberType::Kind: case FloatType::Kind: - case ComplexDoubleType::Kind: case IntType::Kind: case NoneType::Kind: case GeneratorType::Kind: @@ -81,6 +80,9 @@ void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) { case AnyEnumType::Kind: // no op, there is nothing to tag break; + // TODO(@anjali411): Implement serialization/deserialization for complex + // numbers + case ComplexDoubleType::Kind: case EnumType::Kind: // TODO(gmagogsfm): Implement serialization/deserialization of Enum. 
AT_ASSERT(false); @@ -541,15 +543,6 @@ void Unpickler::readGlobal( // Unpickle a tensor bool quantized = class_name == "_rebuild_qtensor"; rebuildTensor(quantized); - } else if (module_name == "builtins" && class_name == "complex") { - globals_.emplace_back([this] { - auto elems = pop(stack_).toTuple()->elements(); - AT_ASSERT(elems.size() == 2); - auto complex = - c10::complex(elems.at(0).toDouble(), elems.at(1).toDouble()); - stack_.emplace_back(complex); - }); - } else if (module_name == "collections" && class_name == "OrderedDict") { // collections.OrderedDict is used in tensor serialization for a tensor's // backward hooks (but they are not actually saved with this Pickler) From 12a434abbc322b6e1103c7fb0c0afb35447dafb7 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Wed, 27 Jan 2021 19:49:03 -0800 Subject: [PATCH 41/41] Revert D26077905: Back out "Revert D25850783: Add torch::deploy, an embedded torch-python interpreter" Test Plan: revert-hammer Differential Revision: D26077905 (https://github.com/pytorch/pytorch/commit/dc2a44c4fc5f5efba16b8567ab970f8bcf1fe007) Original commit changeset: fae83bf9822d fbshipit-source-id: b70185916502ba9ebe16d781cf0659b9f7865c9a --- .github/workflows/lint.yml | 6 - .gitignore | 3 - .jenkins/pytorch/build.sh | 11 - .jenkins/pytorch/test.sh | 8 - CMakeLists.txt | 5 - torch/__init__.py | 10 +- torch/_ops.py | 3 +- torch/_utils_internal.py | 16 +- torch/csrc/Module.cpp | 2 - torch/csrc/deploy/.gitignore | 1 - torch/csrc/deploy/CMakeLists.txt | 3 - torch/csrc/deploy/README.md | 10 - torch/csrc/deploy/example/simple.pt | Bin 2432 -> 0 bytes torch/csrc/deploy/example/trace_simple.py | 20 -- torch/csrc/deploy/interpreter/CMakeLists.txt | 115 ------- .../deploy/interpreter/CMakePythonModules.txt | 69 ---- torch/csrc/deploy/interpreter/freeze.py | 269 --------------- .../deploy/interpreter/hide_symbols.script | 5 - torch/csrc/deploy/interpreter/interpreter.cpp | 324 ------------------ torch/csrc/deploy/interpreter/interpreter.h | 67 ---- .../deploy/interpreter/interpreter_impl.h | 26 -- torch/csrc/deploy/interpreter/test_main.cpp | 49 --- .../deploy/interpreter/third_party/README.md | 2 - torch/cuda/__init__.py | 4 - torch/utils/__init__.py | 8 +- 25 files changed, 12 insertions(+), 1024 deletions(-) delete mode 100644 torch/csrc/deploy/.gitignore delete mode 100644 torch/csrc/deploy/CMakeLists.txt delete mode 100644 torch/csrc/deploy/README.md delete mode 100644 torch/csrc/deploy/example/simple.pt delete mode 100644 torch/csrc/deploy/example/trace_simple.py delete mode 100644 torch/csrc/deploy/interpreter/CMakeLists.txt delete mode 100644 torch/csrc/deploy/interpreter/CMakePythonModules.txt delete mode 100644 torch/csrc/deploy/interpreter/freeze.py delete mode 100644 torch/csrc/deploy/interpreter/hide_symbols.script delete mode 100644 torch/csrc/deploy/interpreter/interpreter.cpp delete mode 100644 torch/csrc/deploy/interpreter/interpreter.h delete mode 100644 torch/csrc/deploy/interpreter/interpreter_impl.h delete mode 100644 torch/csrc/deploy/interpreter/test_main.cpp delete mode 100644 torch/csrc/deploy/interpreter/third_party/README.md diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 9c215540108b..54acbe7b1c6a 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -170,8 +170,6 @@ jobs: # FunctionsManual.cpp is excluded to keep this diff clean. It will be fixed # in a follow up PR. # /torch/csrc/generic/*.cpp is excluded because those files aren't actually built. 
- # deploy/interpreter files are excluded due to using macros and other techniquies - # that are not easily converted to accepted c++ python tools/clang_tidy.py \ --verbose \ --paths torch/csrc/ \ @@ -188,10 +186,6 @@ jobs: -g"-torch/csrc/autograd/FunctionsManual.cpp" \ -g"-torch/csrc/generic/*.cpp" \ -g"-torch/csrc/jit/codegen/cuda/runtime/*" \ - -g"-torch/csrc/deploy/interpreter/interpreter.cpp" \ - -g"-torch/csrc/deploy/interpreter/interpreter.h" \ - -g"-torch/csrc/deploy/interpreter/interpreter_impl.h" \ - -g"-torch/csrc/deploy/interpreter/test_main.cpp" \ "$@" > ${GITHUB_WORKSPACE}/clang-tidy-output.txt cat ${GITHUB_WORKSPACE}/clang-tidy-output.txt diff --git a/.gitignore b/.gitignore index a3a832ce7555..e1fe94cb9bf9 100644 --- a/.gitignore +++ b/.gitignore @@ -66,9 +66,6 @@ torch/csrc/autograd/generated/* torch/testing/_internal/generated/annotated_fn_args.py torch/testing/_internal/data/*.pt torch/csrc/cudnn/cuDNN.cpp -torch/csrc/deploy/interpreter/cpython -torch/csrc/deploy/interpreter/frozen -torch/csrc/deploy/interpreter/third_party/typing_extensions.py torch/csrc/generated torch/csrc/generic/TensorMethods.cpp torch/csrc/jit/generated/* diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index dfd359c1ddf4..fad9c8e49e64 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -23,17 +23,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-mobile-code-analysis* ]]; then exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile-code-analysis.sh" "$@" fi -if [[ "$BUILD_ENVIRONMENT" == *linux-xenial-cuda10.2-cudnn7-py3-gcc7* ]]; then - # Enabling DEPLOY build (embedded torch python interpreter, experimental) - # only on one config for now, can expand later - export USE_DEPLOY=ON - - # Deploy feature builds cpython. It requires these packages. - # TODO move this to dockerfile? - sudo apt-get -qq update - sudo apt-get -qq install libffi-dev libbz2-dev libreadline-dev libncurses5-dev libncursesw5-dev libgdbm-dev libsqlite3-dev uuid-dev tk-dev -fi - echo "Python version:" python --version diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index d70a377ec086..73563f145eb8 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -354,11 +354,6 @@ test_vec256() { fi } -test_torch_deploy() { - SIMPLE_MODEL_PATH=torch/csrc/deploy/example/simple.pt LIBINTERPRETER_PATH=build/lib/libinterpreter.so build/bin/interpreter_test - assert_git_not_dirty -} - if ! 
[[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") @@ -376,9 +371,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 ]]; then - if [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-test1 ]]; then - test_torch_deploy - fi install_torchvision test_python_shard1 elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 ]]; then diff --git a/CMakeLists.txt b/CMakeLists.txt index c138f261c27b..a23208752afb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -919,8 +919,3 @@ endif() include(cmake/Summary.cmake) caffe2_print_configuration_summary() - -# ---[ Torch Deploy -if(USE_DEPLOY) - add_subdirectory(torch/csrc/deploy) -endif() diff --git a/torch/__init__.py b/torch/__init__.py index f27af91eb493..3f9df8bc009a 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -22,11 +22,7 @@ from ._utils import _import_dotted_name from ._utils_internal import get_file_path, prepare_multiprocessing_environment, \ USE_RTLD_GLOBAL_WITH_LIBTORCH, USE_GLOBAL_DEPS -# TODO(torch_deploy) figure out how to freeze version.py in fbcode build -if sys.executable == 'torch_deploy': - __version__ = "torch-deploy-1.8" -else: - from .version import __version__ +from .version import __version__ from ._six import string_classes as _string_classes from typing import Set, Type, TYPE_CHECKING @@ -138,7 +134,7 @@ # See Note [Global dependencies] def _load_global_deps(): - if platform.system() == 'Windows' or sys.executable == 'torch_deploy': + if platform.system() == 'Windows': return lib_name = 'libtorch_global_deps' + ('.dylib' if platform.system() == 'Darwin' else '.so') @@ -520,7 +516,7 @@ class QUInt4x2Storage(_C.QUInt4x2StorageBase, _StorageBase): ################################################################################ def manager_path(): - if platform.system() == 'Windows' or sys.executable == 'torch_deploy': + if platform.system() == 'Windows': return b"" path = get_file_path('torch', 'bin', 'torch_shm_manager') prepare_multiprocessing_environment(get_file_path('torch')) diff --git a/torch/_ops.py b/torch/_ops.py index 96c8baac7838..dd0c8cd19fde 100644 --- a/torch/_ops.py +++ b/torch/_ops.py @@ -2,6 +2,7 @@ import contextlib import ctypes +import os import sys import types @@ -66,7 +67,7 @@ def __getattr__(self, op_name): return op class _Ops(types.ModuleType): - __file__ = '_ops.py' + __file__ = os.path.join(os.path.dirname(__file__), '_ops.py') def __init__(self): super(_Ops, self).__init__('torch.ops') diff --git a/torch/_utils_internal.py b/torch/_utils_internal.py index c77e960ae659..be7d8fcaa685 100644 --- a/torch/_utils_internal.py +++ b/torch/_utils_internal.py @@ -1,7 +1,6 @@ import os import inspect -import sys import tempfile # this arbitrary-looking assortment of functionality is provided here @@ -9,16 +8,11 @@ # use is the FB build environment, where this source file is replaced # by an equivalent. -if sys.executable == 'torch_deploy': - # __file__ is meaningless in the context of frozen torch used in torch deploy. - # setting empty torch_parent should allow below functions to operate without crashing, - # but it's unclear if there is a valid use case for them in the context of deploy. 
- torch_parent = "" +if os.path.basename(os.path.dirname(__file__)) == 'shared': + torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) else: - if os.path.basename(os.path.dirname(__file__)) == 'shared': - torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) - else: - torch_parent = os.path.dirname(os.path.dirname(__file__)) + torch_parent = os.path.dirname(os.path.dirname(__file__)) + def get_file_path(*path_components): return os.path.join(torch_parent, *path_components) @@ -66,7 +60,7 @@ def get_source_lines_and_file(obj, error_msg=None): TEST_MASTER_ADDR = '127.0.0.1' TEST_MASTER_PORT = 29500 -# USE_GLOBAL_DEPS controls whether __init__.py tries to load +# USE_GLOBAL_DEPS controls whether __init__.py tries to load # libtorch_global_deps, see Note [Global dependencies] USE_GLOBAL_DEPS = True # USE_RTLD_GLOBAL_WITH_LIBTORCH controls whether __init__.py tries to load diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index bbd3ccef505f..eb80fff32b81 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -692,8 +692,6 @@ extern "C" #ifdef _WIN32 __declspec(dllexport) #endif -TORCH_API PyObject* initModule(); -// separate decl and defn for msvc error C2491 PyObject* initModule() { HANDLE_TH_ERRORS at::internal::lazy_init_num_threads(); diff --git a/torch/csrc/deploy/.gitignore b/torch/csrc/deploy/.gitignore deleted file mode 100644 index aa484a97a20f..000000000000 --- a/torch/csrc/deploy/.gitignore +++ /dev/null @@ -1 +0,0 @@ -example/generated/* diff --git a/torch/csrc/deploy/CMakeLists.txt b/torch/csrc/deploy/CMakeLists.txt deleted file mode 100644 index 9da314905860..000000000000 --- a/torch/csrc/deploy/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -set(DEPLOY_DIR "${CMAKE_CURRENT_SOURCE_DIR}") - -add_subdirectory(interpreter) diff --git a/torch/csrc/deploy/README.md b/torch/csrc/deploy/README.md deleted file mode 100644 index 4fab5aa4ef56..000000000000 --- a/torch/csrc/deploy/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# Torch Deploy -This is an experimental feature to embed multiple python interpreters inside the torch library, -providing a solution to the 'GIL problem' for multithreading with the convenience of python -and eager or torchscripted pytorch programs. - -# libinterpreter -This is an internal library used behind the scenes to enable multiple python interpreters in -a single deploy runtime. libinterpreter.so is DLOPENed multiple times by the deploy library. -Each copy of libinterpreter exposes a simple interpreter interface but hides its python and other -internal symbols, preventing the different python instances from seeing each other. 
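The deleted README above describes the isolation trick only at a high level: each interpreter lives in a private copy of the shared library whose symbols are kept local. A minimal, hypothetical sketch of that general pattern (not the actual torch::deploy implementation; the library name and helper are made up for illustration):

```python
import ctypes
import os
import shutil
import tempfile

def load_private_copy(lib_path):
    # Copy the library to a unique path and dlopen it with RTLD_LOCAL so its
    # symbols and global state stay invisible to every other loaded copy.
    tmp = tempfile.NamedTemporaryFile(suffix=".so", delete=False)
    shutil.copyfile(lib_path, tmp.name)
    return ctypes.CDLL(tmp.name, mode=os.RTLD_LOCAL)

# Hypothetical usage: two handles, each with its own interpreter state.
# interp_a = load_private_copy("libinterpreter.so")
# interp_b = load_private_copy("libinterpreter.so")
```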
diff --git a/torch/csrc/deploy/example/simple.pt b/torch/csrc/deploy/example/simple.pt deleted file mode 100644 index 50f9a087aa822821647a8acabe28bb84207475b2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2432 zcmah~4Ny~87JmE%jN!K_C{c`n7DSRLExN!vhoWYYRE&s2)-FvT36Ky7`GEqhO3~?p zL|_?ZsctLdMyZHOE5mjH5kaV>s1(uBHlpj!S}e8F-KAEovhVR{g6sCo+_~?)d%knu z`R@776^k4k3BtvNc+OmjmkEO^D@UW`D^iRpe1R~=lh``jhIZRyK=*I)!cZZq8){(n z@pmC?B^ynOG?YYeu>YNUa`b@-j{Wg<^dcXFL{|=d_n*LpxJB4{DiZ_0cLHYUD8#+K z1H*4J@SRPF(w;cn*y4?rHxI+^Z-0Txury4*pNcYmEbge-j72SNa8-N+niN~G_qF}N z<@_7^%N(&n#=@$r$*7cjV)N&FVM)6^KJ=2I;OA!2;OT%JqWUNghic$ze6Z|;`Ixk4 zK3-5SLHR&3X6&{=+nIhCDBc5>@l4=GrIPiSf`i2ftn~r#S$8M-R@H4N@;w^W9&!18t;$kz&B73@dhIeC+LVs` zlLq9;xM(`F6k0QwC>~BDzdtq({_3B=an%TTuO5a^Bl;k4EEHMiO5sOy27bIifQgN2 zB-}WWA20bC8nwOTsGTFcx{C#kmHi}KbDNxWcOnTn)#y)>Gs-O0PW1JTh zx_jV>{nvnD3B?xEILwnAALMW+lz(s#$;PQ1cyw7?Ix5xsJ`Ii&$ zMoAm|aArNmfEhSP6EJ$c5-*z~F>S0H@&#F7Svm^Qi5Wn6X(N+=iojq=D4xeZ!0GNS z_)UBu4zBgWo|~V*DY*)xx09rY-*HlLX#)}-b5IW_z$M%b<64iwf$(M6o6`(+8@-=5os3(8l*-#T1TEOK_~{^#dUxef$D5Ah52=1v4y>y4L_ zsS(C3dR}q13M{ghOQlAwUY;S9GB*@#&?<5@N~Vqoes5clgh7e=ZKWzb!>F^9PdM?U zxki=7z?ABhskthRLTXfI8?<_9-b$vgghr##}J)2Zqhl6h#(PD#5M+r*djY)iCnJLXq0lJN}Fw9ZqzH3dZl8m zN^WH8T!Isf3A#BFy5|`6DXMH$cDjyPlAEfV%cJS)jB2R3M6J<-ri)Frr|0|AJR)S8 zF1c2ryq!>>$|8?x(K!lPgZp|mzPV4UDpD?SS2)b9a*EAyjQ>UMh zppHCUI#E_GGxU(y`Uv~?R zH22tDTQeU2<}#k8qp-i&eDsqR-y4#3*9XQpA0E7W)HCUK1BpX}Y@ZMJ|J~a?yR+=^ z3hfWqUuo3J@RDhN-3~f$-}Idi{!HaOFqQZJzN^@T|FgWa4aSsgqhV4&elwX)pSbN@ zWFI_HbXoj4Nz_G1(Bs7#bt9su_sLW04ODFlW1@a#2iyH-zlAaBEWi3FeTzM{Gd;v2 zJ9?VAldZdlE4^T2HaGNasI`*ZZD2W+c&(r`SCimpVzaKec{bpK6iHLy=cG3~iuL*V z+eB${dU2FzwK_W`>vLk;#A~3s-OOKE)xhis)?B;UL|{|hZz{sn>54@R#_}m2KZnp6 pZ@RJ4Ha(l@o=900-9IU#a1&2{b0**EWwFR/configure --prefix ${PYTHON_INSTALL_DIR} - BUILD_COMMAND CFLAGS=-fPIC CPPFLAGS=-fPIC make -j8 - INSTALL_COMMAND make install - BYPRODUCTS ${PYTHON_MODULES} ${PYTHON_LIB} ${PYTHON_BIN} - LOG_OUTPUT_ON_FAILURE True -) - -# We find the built python modules, this is confusing because python build already outputs -# the modules in a strange nested path, and then that path is relative to the -# Cmake ExternalProject root in the cmake build dir. 
-ExternalProject_Get_property(cpython SOURCE_DIR) -SET(PYTHON_MODULE_DIR "${SOURCE_DIR}/build/temp.linux-x86_64-3.8/${SOURCE_DIR}/Modules") -SET(PYTHON_STDLIB_DIR "${SOURCE_DIR}/Lib") -SET(PYTHON_STDLIB "${PYTHON_INSTALL_DIR}/lib/libpython_stdlib3.8.a") -# Then we use a hardcoded list of expected module names and include them in our lib -include("CMakePythonModules.txt") -ExternalProject_Add_Step( - cpython - archive_stdlib - DEPENDEES install - BYPRODUCTS ${PYTHON_STDLIB} - COMMAND ar -rc ${PYTHON_STDLIB} ${PYTHON_MODULES} - VERBATIM -) -# Get python typing extension, needed by torch -SET(TYPING_PKG "${INTERPRETER_DIR}/third_party/typing_extensions.py") -ExternalProject_Add( - typing - PREFIX typing - GIT_REPOSITORY https://github.com/python/typing.git - GIT_TAG 3.7.4.3 - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND cp ../typing/typing_extensions/src_py3/typing_extensions.py ${TYPING_PKG} - BYPRODUCTS ${TYPING_PKG} - LOG_OUTPUT_ON_FAILURE True -) - -# Output files generated by freeze script, containing frozen bytecode -SET(FROZEN_DIR "${INTERPRETER_DIR}/frozen") -set(FROZEN_FILES - ${FROZEN_DIR}/main.c - ${FROZEN_DIR}/bytecode_0.c - ${FROZEN_DIR}/bytecode_1.c - ${FROZEN_DIR}/bytecode_2.c - ${FROZEN_DIR}/bytecode_3.c - ${FROZEN_DIR}/bytecode_4.c -) -# Packages to freeze: python stdlib, typing extension, and torch -add_custom_command( - OUTPUT ${FROZEN_FILES} - WORKING_DIRECTORY ${INTERPRETER_DIR} - COMMAND mkdir -p ${FROZEN_DIR} - COMMAND ${PYTHON_BIN} freeze.py ${PYTHON_STDLIB_DIR} ${TYPING_PKG} ${PYTORCH_ROOT}/torch --oss --install_dir ${FROZEN_DIR} --verbose - DEPENDS cpython typing - VERBATIM -) - -# instantiate a library based on the objects that make up torch_python -# make sure system python isn't used here -target_include_directories(torch_python_obj BEFORE PRIVATE ${PYTHON_INC_DIR}) -add_library(torch_python_static STATIC $) -# Build the interpreter lib, designed to be standalone and dlopened -# We bake the python and torch_python binding objs into libinterpreter -set(LINKER_SCRIPT "${INTERPRETER_DIR}/hide_symbols.script") -set(INTERPRETER_LIB_SOURCES - ${INTERPRETER_DIR}/interpreter.cpp - ${FROZEN_FILES} - ${LINKER_SCRIPT} -) -add_library(interpreter ${INTERPRETER_LIB_SOURCES} ${LINKER_SCRIPT}) -set_property(TARGET interpreter APPEND_STRING PROPERTY - LINK_FLAGS " -Wl,--version-script=${LINKER_SCRIPT}") -# need to ensure headers are present before any .cpp in interpreter are compiled, -# but cpp themselves don't clearly depend on cpython so there is a race otherwise -add_dependencies(interpreter cpython) -target_compile_options( - interpreter PRIVATE - -fvisibility=hidden -) -target_include_directories(interpreter PRIVATE ${INTERPRETER_DIR}) -target_include_directories(interpreter PUBLIC ${PYTHON_INC_DIR}) -target_link_libraries(interpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static) -target_link_libraries(interpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins -target_link_libraries(interpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite) - -# handy to have a standalone app to verify linkage and usage of interpreter before embedding it in another lib -set(INTERPRETER_TEST_SOURCES - ${INTERPRETER_DIR}/test_main.cpp -) -add_executable(interpreter_test ${INTERPRETER_TEST_SOURCES}) -target_include_directories(interpreter_test PRIVATE ${PYTORCH_ROOT}/torch) -target_include_directories(interpreter_test PRIVATE ${PYTHON_INC_DIR}) 
-target_link_libraries(interpreter_test PUBLIC gtest dl) -# no-as-needed to ensure shm and torch are included to satisfy runtime dlopen -# dependencies for libinterpreter, regardless of whether they are used in interpreter_test -target_link_libraries(interpreter_test PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite) diff --git a/torch/csrc/deploy/interpreter/CMakePythonModules.txt b/torch/csrc/deploy/interpreter/CMakePythonModules.txt deleted file mode 100644 index c6bc9cab76ff..000000000000 --- a/torch/csrc/deploy/interpreter/CMakePythonModules.txt +++ /dev/null @@ -1,69 +0,0 @@ -SET(PYTHON_MODULES - ${PYTHON_MODULE_DIR}/arraymodule.o - ${PYTHON_MODULE_DIR}/_asynciomodule.o - ${PYTHON_MODULE_DIR}/audioop.o - ${PYTHON_MODULE_DIR}/binascii.o - ${PYTHON_MODULE_DIR}/_bisectmodule.o - ${PYTHON_MODULE_DIR}/_blake2/blake2module.o ${PYTHON_MODULE_DIR}/_blake2/blake2b_impl.o ${PYTHON_MODULE_DIR}/_blake2/blake2s_impl.o - ${PYTHON_MODULE_DIR}/_bz2module.o - ${PYTHON_MODULE_DIR}/cmathmodule.o - # ${PYTHON_MODULE_DIR}/_math.o - ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_cn.o - ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_hk.o - ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_iso2022.o - ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_jp.o - ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_kr.o - ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_tw.o - ${PYTHON_MODULE_DIR}/_contextvarsmodule.o - ${PYTHON_MODULE_DIR}/_cryptmodule.o - ${PYTHON_MODULE_DIR}/_csv.o - ${PYTHON_MODULE_DIR}/_ctypes/_ctypes.o ${PYTHON_MODULE_DIR}/_ctypes/callbacks.o ${PYTHON_MODULE_DIR}/_ctypes/callproc.o ${PYTHON_MODULE_DIR}/_ctypes/stgdict.o ${PYTHON_MODULE_DIR}/_ctypes/cfield.o - ${PYTHON_MODULE_DIR}/_ctypes/_ctypes_test.o - ${PYTHON_MODULE_DIR}/_cursesmodule.o - ${PYTHON_MODULE_DIR}/_curses_panel.o - ${PYTHON_MODULE_DIR}/_datetimemodule.o - ${PYTHON_MODULE_DIR}/_decimal/_decimal.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/basearith.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/constants.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/context.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/convolute.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/crt.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/difradix2.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/fnt.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/fourstep.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/io.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/memory.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/mpdecimal.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/numbertheory.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/sixstep.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/transpose.o - ${PYTHON_MODULE_DIR}/_elementtree.o - ${PYTHON_MODULE_DIR}/fcntlmodule.o - ${PYTHON_MODULE_DIR}/grpmodule.o - ${PYTHON_MODULE_DIR}/_hashopenssl.o - ${PYTHON_MODULE_DIR}/_heapqmodule.o - ${PYTHON_MODULE_DIR}/_json.o - ${PYTHON_MODULE_DIR}/_lsprof.o - ${PYTHON_MODULE_DIR}/_lzmamodule.o - ${PYTHON_MODULE_DIR}/mathmodule.o - ${PYTHON_MODULE_DIR}/md5module.o - ${PYTHON_MODULE_DIR}/mmapmodule.o - ${PYTHON_MODULE_DIR}/cjkcodecs/multibytecodec.o - ${PYTHON_MODULE_DIR}/_multiprocessing/multiprocessing.o ${PYTHON_MODULE_DIR}/_multiprocessing/semaphore.o - ${PYTHON_MODULE_DIR}/nismodule.o - ${PYTHON_MODULE_DIR}/_opcode.o - ${PYTHON_MODULE_DIR}/ossaudiodev.o - ${PYTHON_MODULE_DIR}/parsermodule.o - ${PYTHON_MODULE_DIR}/_pickle.o - ${PYTHON_MODULE_DIR}/_posixsubprocess.o - ${PYTHON_MODULE_DIR}/pyexpat.o ${PYTHON_MODULE_DIR}/expat/xmlparse.o ${PYTHON_MODULE_DIR}/expat/xmlrole.o ${PYTHON_MODULE_DIR}/expat/xmltok.o - ${PYTHON_MODULE_DIR}/_queuemodule.o - ${PYTHON_MODULE_DIR}/_randommodule.o - 
${PYTHON_MODULE_DIR}/readline.o - ${PYTHON_MODULE_DIR}/resource.o - ${PYTHON_MODULE_DIR}/selectmodule.o - ${PYTHON_MODULE_DIR}/sha1module.o - ${PYTHON_MODULE_DIR}/sha256module.o - ${PYTHON_MODULE_DIR}/_sha3/sha3module.o - ${PYTHON_MODULE_DIR}/sha512module.o - ${PYTHON_MODULE_DIR}/socketmodule.o - ${PYTHON_MODULE_DIR}/spwdmodule.o - ${PYTHON_MODULE_DIR}/_ssl.o - ${PYTHON_MODULE_DIR}/_struct.o - ${PYTHON_MODULE_DIR}/syslogmodule.o - ${PYTHON_MODULE_DIR}/termios.o - ${PYTHON_MODULE_DIR}/_testbuffer.o - ${PYTHON_MODULE_DIR}/_testcapimodule.o - ${PYTHON_MODULE_DIR}/_testimportmultiple.o - ${PYTHON_MODULE_DIR}/_testmultiphase.o - ${PYTHON_MODULE_DIR}/unicodedata.o - ${PYTHON_MODULE_DIR}/xxlimited.o - ${PYTHON_MODULE_DIR}/_xxtestfuzz/_xxtestfuzz.o ${PYTHON_MODULE_DIR}/_xxtestfuzz/fuzzer.o - ${PYTHON_MODULE_DIR}/zlibmodule.o -) diff --git a/torch/csrc/deploy/interpreter/freeze.py b/torch/csrc/deploy/interpreter/freeze.py deleted file mode 100644 index 459b7be9381c..000000000000 --- a/torch/csrc/deploy/interpreter/freeze.py +++ /dev/null @@ -1,269 +0,0 @@ -""" -Freeze Python packages. - -Freezing makes it possible to ship arbitrary Python modules as part of a C++ -library. The Python source of the module is compiled to bytecode and written -to `.c` files, to be imported by Python's built-in FrozenImporter. - -In a normal Python installation, FrozenImporter is only used to bootstrap the -initialization of the import machinery. Python's importers are defined in -Python (see `_bootstrap.py` and `_bootstrap_external.py`) but need to be -retrieved before any importers are available. Freezing the module bytecode -resolves this circular dependency. - -This script will freeze the Python standard library. It produces two things: -- Bytecode files: A set of `.c` files that define C variables containing Python bytecode. -- Main file: A `main.c` file listing all of these modules in the right form to be - consumed by FrozenImporter. - -The library that wishes to use these modules makes them available to the local -Python instance by extending `PyImport_FrozenModules` appropriately (see -https://docs.python.org/3/c-api/import.html#c.PyImport_FrozenModules). -""" - -import argparse -import functools -import itertools -import marshal -import os -from dataclasses import dataclass -from pathlib import Path -from typing import List - - -MAIN_INCLUDES = """#include <Python.h> - -""" - -MAIN_PREFIX = """ -// Compiled standard library modules. These should be appended to the existing -// `PyImport_FrozenModules` that ships with CPython. -struct _frozen _PyImport_FrozenModules_torch[] = { -""" - -FAKE_PREFIX = """ -// Compiled standard library modules. These should be appended to the existing -// `PyImport_FrozenModules` that ships with CPython. -struct _frozen _PyImport_FrozenModules[] = { -""" - -MAIN_SUFFIX = """\ - {0, 0, 0} /* sentinel */ -}; -""" - -# Exclude some standard library modules to: -# 1. Slim down the final frozen lib. -# 2. Remove functionality we don't want to support. -DENY_LIST = [ - # Interface to unix databases - "dbm", - # ncurses bindings (terminal interfaces) - "curses", - # Tcl/Tk GUI - "tkinter", - # Tests for the standard library - "test", - "tests", - "idle_test", - "__phello__.foo.py", - # importlib frozen modules. These are already baked into CPython.
- "_bootstrap.py", - "_bootstrap_external.py", -] - -NUM_BYTECODE_FILES = 5 - - -def indent_msg(fn): - @functools.wraps(fn) - def wrapper(*args, **kwargs): - args[0].indent += 1 - ret = fn(*args, **kwargs) - args[0].indent -= 1 - return ret - - return wrapper - - -@dataclass -class FrozenModule: - # The fully qualified module name, e.g. 'foo.bar.baz' - module_name: str - # The name of the C variable that holds the bytecode, e.g. 'M_foo__bar__baz' - c_name: str - # The size of the C variable. Negative if this module is a package. - size: int - # The frozen bytecode - bytecode: bytes - - -class Freezer: - def __init__(self, verbose: bool): - self.frozen_modules: List[FrozenModule] = [] - self.indent: int = 0 - self.verbose: bool = verbose - - def msg(self, path: Path, code: str): - if not self.verbose: - return - # P: package dir - # F: python file - # S: skipped (not a package dir) - # X: skipped (deny-listed) - # N: skipped (not a python file) - for i in range(self.indent): - print(" ", end="") - print(f"{code} {path}") - - def write_bytecode(self, install_root): - """ - Write the `.c` files containing the frozen bytecode. Shard frozen - modules evenly across the files. - """ - bytecode_file_names = [ - f"bytecode_{i}.c" for i in range(NUM_BYTECODE_FILES) - ] - bytecode_files = [open(os.path.join(install_root, name), "w") for name in bytecode_file_names] - it = itertools.cycle(bytecode_files) - for m in self.frozen_modules: - self.write_frozen(m, next(it)) - - for f in bytecode_files: - f.close() - - def write_main(self, install_root, oss): - """ - Write the `main.c` file containing a table enumerating all the - frozen modules. - """ - with open(os.path.join(install_root, "main.c"), "w") as outfp: - outfp.write(MAIN_INCLUDES) - for m in self.frozen_modules: - outfp.write(f"extern unsigned char {m.c_name}[];\n") - - outfp.write(MAIN_PREFIX) - for m in self.frozen_modules: - outfp.write(f'\t{{"{m.module_name}", {m.c_name}, {m.size}}},\n') - outfp.write(MAIN_SUFFIX) - if oss: - outfp.write(FAKE_PREFIX) - outfp.write(MAIN_SUFFIX) - - def write_frozen(self, m: FrozenModule, outfp): - """ - Write a single frozen module's bytecode out to a C variable. - """ - outfp.write(f"unsigned char {m.c_name}[] = {{") - for i in range(0, len(m.bytecode), 16): - outfp.write("\n\t") - for c in bytes(m.bytecode[i : i + 16]): - outfp.write("%d," % c) - outfp.write("\n};\n") - - def compile_path(self, path: Path, top_package_path: Path): - """Generic entry point for compiling a Path object.""" - if path.is_dir(): - self.compile_package(path, top_package_path) - else: - self.compile_file(path, top_package_path) - - @indent_msg - def compile_package(self, path: Path, top_package_path: Path): - """Compile all the files within a Python package dir.""" - assert path.is_dir() - if path.name in DENY_LIST: - self.msg(path, "X") - return - - # Python packages are directories that have __init__.py in them. - is_package_dir = any([child.name == "__init__.py" for child in path.iterdir()]) - if not is_package_dir: - self.msg(path, "S") - return - - self.msg(path, "P") - # Recursively compile all children in this dir - for child in path.iterdir(): - self.compile_path(child, top_package_path) - - def get_module_qualname(self, file_path: Path, top_package_path: Path) -> List[str]: - # `path` looks like 'Lib/foo/bar/baz.py' - - # chop off 'Lib/' to get something that represents a Python module hierarchy. - # e.g. 
'foo/bar/baz.py', which maps to 'foo.bar.baz' - normalized_path = file_path.relative_to(top_package_path.parent) - - if normalized_path.name == "__init__.py": - # Special handling for `__init__.py`. In this case, this file - # specifies that the containing directory should be treated as a package. - # For 'foo/bar/baz/__init__.py': - # - The module name is 'baz' - module_basename = normalized_path.parent.name - # - The parent is foo.bar (need to shave off the 'baz') - module_parent = normalized_path.parent.parent.parts - else: - module_basename = normalized_path.stem - module_parent = normalized_path.parent.parts - return list(module_parent) + [module_basename] - - @indent_msg - def compile_file(self, path: Path, top_package_path: Path): - """ - Compile a Python source file to frozen bytecode. Append the result to - `self.frozen_modules`. - """ - assert path.is_file() - if path.suffix != ".py": - self.msg(path, "N") - return - - if path.name in DENY_LIST: - self.msg(path, "X") - return - - self.msg(path, "F") - module_qualname = self.get_module_qualname(path, top_package_path) - module_mangled_name = "__".join(module_qualname) - c_name = "M_" + module_mangled_name - - with open(path, "r") as src_file: - co = compile(src_file.read(), path, "exec") - - bytecode = marshal.dumps(co) - size = len(bytecode) - if path.name == '__init__.py': - # Python packages are signified by negative size. - size = -size - self.frozen_modules.append( - FrozenModule(".".join(module_qualname), c_name, size, bytecode) - ) - - -parser = argparse.ArgumentParser(description="Compile py source") -parser.add_argument("paths", nargs="*", help="Paths to freeze.") -parser.add_argument("--verbose", action="store_true", help="Print debug logs") -parser.add_argument("--install_dir", help="Root directory for all output files") -parser.add_argument("--fbcode_dir", help="Root directory for all output files") -parser.add_argument("--oss", action="store_true", help="If it's OSS build, add a fake _PyImport_FrozenModules") - -args = parser.parse_args() - -f = Freezer(args.verbose) - -for p in args.paths: - if args.fbcode_dir: - p = os.path.join(args.fbcode_dir, p) - path = Path(p) - if path.is_dir() and not Path.exists(path / '__init__.py'): - # this 'top level path p' is a standard directory containing modules, - # not a module itself - # each 'mod' could be a dir containing __init__.py or .py file - for mod in path.glob("*"): - f.compile_path(mod, mod) - else: - f.compile_path(path, path) - -f.write_bytecode(args.install_dir) -f.write_main(args.install_dir, args.oss) diff --git a/torch/csrc/deploy/interpreter/hide_symbols.script b/torch/csrc/deploy/interpreter/hide_symbols.script deleted file mode 100644 index c748c8bfec95..000000000000 --- a/torch/csrc/deploy/interpreter/hide_symbols.script +++ /dev/null @@ -1,5 +0,0 @@ -INTERPRETER_0.1 { - global: - initialize_interface; - local: *; # hide everything else -}; diff --git a/torch/csrc/deploy/interpreter/interpreter.cpp b/torch/csrc/deploy/interpreter/interpreter.cpp deleted file mode 100644 index 7d685d33435c..000000000000 --- a/torch/csrc/deploy/interpreter/interpreter.cpp +++ /dev/null @@ -1,324 +0,0 @@ -#include - -#define PY_SSIZE_T_CLEAN -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace py = pybind11; -using namespace py::literals; - -// TODO this should come from cmake -#define DEBUG 0 -template -const auto PYOBJ_ASSERT(T obj) { -#if (DEBUG == 1) - if (NULL == obj) { - PyErr_Print(); - } -#endif - 
TORCH_INTERNAL_ASSERT(NULL != obj); -} - -static wchar_t* program; - -#define FOREACH_LIBRARY(_) \ - _(array) \ - _(_asyncio) \ - _(audioop) \ - _(binascii) \ - _(_bisect) \ - _(_blake2) \ - _(_bz2) \ - _(cmath) \ - _(_codecs_cn) \ - _(_codecs_hk) \ - _(_codecs_iso2022) \ - _(_codecs_jp) \ - _(_codecs_kr) \ - _(_codecs_tw) \ - _(_contextvars) \ - _(_crypt) \ - _(_csv) \ - _(_ctypes) \ - _(_ctypes_test) \ - _(_curses) \ - _(_curses_panel) \ - _(_datetime) \ - _(_decimal) \ - _(_elementtree) \ - _(fcntl) \ - _(grp) \ - _(_hashlib) \ - _(_heapq) \ - _(_json) \ - _(_lsprof) \ - _(_lzma) \ - _(math) \ - _(_md5) \ - _(mmap) \ - _(_multibytecodec) \ - _(_multiprocessing) \ - _(nis) \ - _(_opcode) \ - _(ossaudiodev) \ - _(parser) \ - _(_pickle) \ - _(_posixsubprocess) \ - _(pyexpat) \ - _(_queue) \ - _(_random) \ - _(readline) \ - _(resource) \ - _(select) \ - _(_sha1) \ - _(_sha256) \ - _(_sha3) \ - _(_sha512) \ - _(_socket) \ - _(spwd) \ - _(_ssl) \ - _(_struct) \ - _(syslog) \ - _(termios) \ - _(_testbuffer) \ - _(_testcapi) \ - _(_testimportmultiple) \ - _(_testmultiphase) \ - _(unicodedata) \ - _(xxlimited) \ - _(_xxtestfuzz) \ - _(zlib) - -#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void); -FOREACH_LIBRARY(DECLARE_LIBRARY_INIT) -#undef DECLARE_LIBRARY_INIT - -extern "C" __attribute__((visibility("default"))) void initialize_interface( - InterpreterImpl* s) { -#define INITIALIZE_MEMBER(func) s->func = func; - FOREACH_INTERFACE_FUNCTION(INITIALIZE_MEMBER) -#undef INITIALIZE_MEMBER -} - -// These numbers of modules should not change as long as the cpython version -// embedded in the build remains fixed -static const size_t NUM_FROZEN_PY_BUILTIN_MODULES = 6; -static const size_t NUM_FROZEN_PY_STDLIB_MODULES = 680; - -// We need to preserve the existing FrozenModules list, since it includes -// important importlib machinery. This code is adapted from the similar -// `PyImport_ExtendInittab`. -int extendFrozenModules(struct _frozen *frozenpython, struct _frozen *frozentorch) { - struct _frozen *p = nullptr; - size_t a = 0, b = 0, c = 0; - int res = 0; - - /* Count the number of entries in both tables */ - for (a = 0; frozenpython[a].name != nullptr; a++) { - // std::cout << "frozenpython[" << a << "]: " << frozenpython[a].name << std::endl; - } - for (b = 0; frozentorch[b].name != nullptr; b++) { - // std::cout << "frozentorch[" << b << "]: " << frozentorch[b].name << std::endl; - } - for (c = 0; PyImport_FrozenModules[c].name != nullptr; c++) { - // std::cout << "oldfrozen[" << c << "]: " << PyImport_FrozenModules[c].name << std::endl; - } - - // Num frozen builtins shouldn't change (unless modifying the underlying cpython version) - TORCH_INTERNAL_ASSERT(c == NUM_FROZEN_PY_BUILTIN_MODULES, "Missing python builtin frozen modules"); - // Check a+b together since in OSS a is empty and b contains stdlib+torch, while - // in fbcode they are separated due to thirdparty2 frozenpython. - // No fixed number of torch modules to check for, but there should be at least one. 
- TORCH_INTERNAL_ASSERT(a + b > NUM_FROZEN_PY_STDLIB_MODULES + 1, "Missing frozen python stdlib or torch modules"); - - /* Allocate new memory for the combined table */ - if (a + b + c <= SIZE_MAX / sizeof(struct _frozen) - 1) { - size_t size = sizeof(struct _frozen) * (a + b + c + 1); - p = (_frozen*)PyMem_Realloc(p, size); - } - if (p == nullptr) { - return -1; - } - - /* Copy the tables into the new memory */ - memcpy(p, PyImport_FrozenModules, (c + 1) * sizeof(struct _frozen)); - memcpy(p + c, frozenpython, (a + 1) * sizeof(struct _frozen)); - memcpy(p + a + c, frozentorch, (b + 1) * sizeof(struct _frozen)); - PyImport_FrozenModules = p; - return res; -} - -// We need to register a custom finder because we are registering `torch._C` as -// a built-in module, and it will otherwise get skipped by the default importer. -const char* finder = R"RAW( -import sys -# Remove the path-based importer, as we don't want our isolated interpreter to read the file system -sys.meta_path = sys.meta_path[:-1] - -class F: - def find_spec(self, fullname, path, target=None): - if fullname == 'torch._C': - return sys.meta_path[1].find_spec('torch._C', None, None) - return None -sys.meta_path.insert(0, F()) - -# make loader importable -)RAW"; - -const char* sysprint = R"RAW( -import sys -print("exec_prefix:", sys.base_exec_prefix) -print("_base_executable:", sys._base_executable) -print("base_prefix:", sys.base_prefix) -print("exec_prefix:", sys.exec_prefix) -print("executable:", sys.executable) -print("path:", sys.path) -print("prefix:", sys.prefix) - -)RAW"; - -extern "C" PyObject* initModule(void); -extern "C" struct _frozen _PyImport_FrozenModules[]; -extern "C" struct _frozen _PyImport_FrozenModules_torch[]; - -static std::atomic s_id; -std::map forwards; - -__attribute__((constructor)) void init() { - -} - -void startup() { -#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name); - FOREACH_LIBRARY(APPEND_INIT) -#undef APPEND_INIT - PyImport_AppendInittab("torch._C", initModule); - - int ret = extendFrozenModules(_PyImport_FrozenModules, _PyImport_FrozenModules_torch); - TORCH_INTERNAL_ASSERT(ret == 0); - - PyPreConfig preconfig; - PyPreConfig_InitIsolatedConfig(&preconfig); - PyStatus status = Py_PreInitialize(&preconfig); - TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status)) - - PyConfig config; - PyConfig_InitIsolatedConfig(&config); - - // Completely blank out the path configuration. This ensures we have complete - // control of how our embedded Python searches for modules, and we will never - // consult the external filesystem. 
See: - // https://docs.python.org/3/c-api/init_config.html#path-configuration - config.site_import = 0; - - status = PyConfig_SetString(&config, &config.base_exec_prefix, L""); - status = PyConfig_SetString(&config, &config.base_executable, L"torch_deploy"); - status = PyConfig_SetString(&config, &config.base_prefix, L""); - status = PyConfig_SetString(&config, &config.exec_prefix, L""); - status = PyConfig_SetString(&config, &config.executable, L"torch_deploy"); - status = PyConfig_SetString(&config, &config.prefix, L""); - - - config.module_search_paths_set = 1; - std::array module_search_paths = {}; - status = PyConfig_SetWideStringList( - &config, &config.module_search_paths, 0, module_search_paths.data()); - - status = Py_InitializeFromConfig(&config); - PyConfig_Clear(&config); - TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status)) - - // Uncomment to debug python config - // PyRun_SimpleString(sysprint); - - PyRun_SimpleString(finder); - // Release the GIL that PyInitialize acquires - PyEval_SaveThread(); -} - -void teardown() { - PyGILState_Ensure(); - - if (Py_FinalizeEx() < 0) { - std::cout << "IT BROKE SO WE ARE EXITING\n"; - exit(120); - } - PyMem_RawFree(program); -} - -__attribute__((destructor)) void deinit() {} - -void run_some_python(const char* code) { - PyGILState_STATE gstate = PyGILState_Ensure(); - - if (PyRun_SimpleString(code) == -1) { - throw std::runtime_error("python eval failed\n"); - } - PyGILState_Release(gstate); -} - -void run_python_file(const char* code) { - PyGILState_STATE gstate = PyGILState_Ensure(); - - FILE* f = fopen(code, "r"); - if (PyRun_SimpleFile(f, code) == -1) { - throw std::runtime_error("python eval failed\n"); - } - fclose(f); - - PyGILState_Release(gstate); -} - - -size_t load_model(const char* filename, bool hermetic) { - PyGILState_STATE gstate = PyGILState_Ensure(); - TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1); - std::string code; - - if (hermetic) { - code = fmt::format(R"( -from torch.package import PackageImporter - -i = PackageImporter('{}') -model = i.load_pickle('model', 'model.pkl') -)", filename); - } else { - code = std::string("model = torch.jit.load('") + - std::string(filename) + std::string("')"); - } - py::exec(code); - - auto id = ++s_id; - - PyGILState_Release(gstate); - return id; -} - -at::Tensor forward_model(size_t model_id, at::Tensor const & input) { - at::Tensor output; - PyGILState_STATE gstate = PyGILState_Ensure(); - { - TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1); - auto forward = py::globals()["model"].attr("forward"); - - py::object py_output = forward(input); - // TODO is this going to leak? 
- // added it to prevent crash when using 'output' tensor in the caller of - // forward() - py_output.inc_ref(); - output = py::cast(py_output); - } - - PyGILState_Release(gstate); - - return output; - // return input; -} diff --git a/torch/csrc/deploy/interpreter/interpreter.h b/torch/csrc/deploy/interpreter/interpreter.h deleted file mode 100644 index 29e435e44970..000000000000 --- a/torch/csrc/deploy/interpreter/interpreter.h +++ /dev/null @@ -1,67 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -class Interpreter : public InterpreterImpl { - private: - std::string library_name_; - void* handle_; - - public: - Interpreter() : handle_(nullptr) { - char library_name[L_tmpnam]; - char* libinterpreter_path = std::getenv("LIBINTERPRETER_PATH"); - if (libinterpreter_path == nullptr) { - throw std::runtime_error("libinterpreter_path is NULL, set LIBINTERPRETER_PATH env."); - } - std::tmpnam(library_name); - library_name_ = library_name; - { - std::ifstream src(libinterpreter_path, std::ios::binary); - std::ofstream dst(library_name, std::ios::binary); - dst << src.rdbuf(); - } - handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY); - if (!handle_) { - throw std::runtime_error(dlerror()); - } - - // technically, we can unlink the library right after dlopen, and this is - // better for cleanup because even if we crash the library doesn't stick - // around. However, it's crap for debugging because gdb can't find the - // symbols if the library is no longer present. - unlink(library_name_.c_str()); - - void* initialize_interface = dlsym(handle_, "initialize_interface"); - if (!initialize_interface) { - throw std::runtime_error("Unable to load initialize_interface function from interpreter lib."); - } - ((void (*)(InterpreterImpl*))initialize_interface)(this); - - this->startup(); - - // the actual torch loading process is not thread safe; by doing it - // in the constructor before we have multiple worker threads, we - // ensure it doesn't race. - run_some_python("import torch"); - } - ~Interpreter() { - if (handle_) { - this->teardown(); - - // it segfaults its face off trying to unload, but it's not clear - // if this is something we caused or if libtorch_python would also do the - // same if it were opened/closed a lot... - dlclose(handle_); - } - } - Interpreter(const Interpreter&) = delete; -}; diff --git a/torch/csrc/deploy/interpreter/interpreter_impl.h b/torch/csrc/deploy/interpreter/interpreter_impl.h deleted file mode 100644 index 82326bd370f1..000000000000 --- a/torch/csrc/deploy/interpreter/interpreter_impl.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once -#include - -// NOTE- if adding new interface functions, -// update interpreter.cpp initialize_interface.
-size_t load_model(const char* model_file, bool hermetic=false); -at::Tensor forward_model(size_t model_id, at::Tensor const & input); -void run_some_python(const char* code); -void startup(); -void teardown(); -void run_python_file(const char* code); - - -#define FOREACH_INTERFACE_FUNCTION(_) \ - _(load_model) \ - _(forward_model) \ - _(run_some_python) \ - _(startup) \ - _(teardown) \ - _(run_python_file) - -struct InterpreterImpl { -#define DEFINE_POINTER(func) decltype(&::func) func; - FOREACH_INTERFACE_FUNCTION(DEFINE_POINTER) -#undef DEFINE_POINTER -}; diff --git a/torch/csrc/deploy/interpreter/test_main.cpp b/torch/csrc/deploy/interpreter/test_main.cpp deleted file mode 100644 index 6107267c9f29..000000000000 --- a/torch/csrc/deploy/interpreter/test_main.cpp +++ /dev/null @@ -1,49 +0,0 @@ -#include -#include -#include -#include -#include -#include - -int main(int argc, char* argv[]) { - ::testing::InitGoogleTest(&argc, argv); - - int rc = RUN_ALL_TESTS(); - - return rc; -} - -TEST(Interpreter, Sanity) { - ASSERT_TRUE(true); -} - -TEST(Interpreter, Hello) { - Interpreter interp; - interp.run_some_python("print('hello from first interpreter!')"); - - Interpreter interp2; - interp2.run_some_python("print('hello from second interpreter!')"); -} - -void compare_torchpy_jit(const char* model_filename, at::Tensor const & input) { - Interpreter interp; - // Test - auto model_id = interp.load_model(model_filename, false); - at::Tensor output = interp.forward_model(model_id, input); - - // Reference - auto ref_model = torch::jit::load(model_filename); - std::vector<torch::jit::IValue> ref_inputs; - ref_inputs.emplace_back(torch::jit::IValue(input)); - at::Tensor ref_output = ref_model.forward(ref_inputs).toTensor(); - - ASSERT_TRUE(ref_output.equal(output)); -} - -TEST(Interpreter, SimpleModel) { - char* model_path = std::getenv("SIMPLE_MODEL_PATH"); - ASSERT_NE(model_path, nullptr); - const int A = 10, B = 20; - compare_torchpy_jit( - model_path, torch::ones(at::IntArrayRef({A, B}))); -} diff --git a/torch/csrc/deploy/interpreter/third_party/README.md b/torch/csrc/deploy/interpreter/third_party/README.md deleted file mode 100644 index 2c5d9241d2bb..000000000000 --- a/torch/csrc/deploy/interpreter/third_party/README.md +++ /dev/null @@ -1,2 +0,0 @@ -Python libraries that we want to package along with the Python implementation -bundled in libinterpreter. diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index faddb8cb16e2..7286387644ad 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -113,10 +113,6 @@ def _lazy_call(callable): if is_initialized(): callable() else: - # TODO(torch_deploy): this accesses linecache, which attempts to read the - # file system to get traceback info. Patch linecache or do something - # else here if this ends up being important.
- # Don't store the actual traceback to avoid memory cycle _queued_calls.append((callable, traceback.format_stack())) diff --git a/torch/utils/__init__.py b/torch/utils/__init__.py index 73eb7f93cf1c..df6a3793e90d 100644 --- a/torch/utils/__init__.py +++ b/torch/utils/__init__.py @@ -2,7 +2,6 @@ from .throughput_benchmark import ThroughputBenchmark import os.path as _osp -import sys # Set the module for a given object for nicer printing def set_module(obj, mod): @@ -10,8 +9,5 @@ def set_module(obj, mod): raise TypeError("The mod argument should be a string") obj.__module__ = mod -if sys.executable == "torch_deploy": - # not valid inside torch_deploy interpreter, no paths exists for frozen modules - cmake_prefix_path = None -else: - cmake_prefix_path = _osp.join(_osp.dirname(_osp.dirname(__file__)), 'share', 'cmake') +#: Path to folder containing CMake definitions for Torch package +cmake_prefix_path = _osp.join(_osp.dirname(_osp.dirname(__file__)), 'share', 'cmake')
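Note on the restored cmake_prefix_path above: downstream C++ builds typically hand this directory to CMake so that find_package(Torch) can locate TorchConfig.cmake. A minimal sketch of that usage (illustrative only, not part of this patch; the external project and its CMake invocation are assumed):

    # Print the prefix CMake needs on its search path to find the Torch package.
    import torch.utils
    print(torch.utils.cmake_prefix_path)
    # Typical consumption when configuring an out-of-tree C++ project:
    #   cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch; print(torch.utils.cmake_prefix_path)')" ..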