From d126a0d4fd6f0b6da928135238cd2cc8f2072380 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Tue, 22 Sep 2020 01:45:44 -0700 Subject: [PATCH 001/449] [iOS] Disable the iOS nightly build until the cert issue has resolved (#45094) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45094 Test Plan: Imported from OSS Reviewed By: husthyc Differential Revision: D23831152 Pulled By: xta0 fbshipit-source-id: 6327edba01e4d5abad63ac35680eefb22276423f --- .circleci/cimodel/data/simple/nightly_ios.py | 2 +- .circleci/config.yml | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/.circleci/cimodel/data/simple/nightly_ios.py b/.circleci/cimodel/data/simple/nightly_ios.py index 580dfa3d7ae8..6c01479dde80 100644 --- a/.circleci/cimodel/data/simple/nightly_ios.py +++ b/.circleci/cimodel/data/simple/nightly_ios.py @@ -60,7 +60,7 @@ def gen_tree(self): WORKFLOW_DATA = BUILD_CONFIGS + [ - IOSNightlyJob("binary", is_upload=True), + # IOSNightlyJob("binary", is_upload=True), ] diff --git a/.circleci/config.yml b/.circleci/config.yml index b32bb9b5086a..5ca2d725b9e9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7021,15 +7021,6 @@ workflows: ios_arch: arm64 ios_platform: OS name: pytorch_ios_11_2_1_nightly_arm64_build - - binary_ios_upload: - build_environment: libtorch-ios-11.2.1-nightly-binary-build-upload - context: org-member - filters: - branches: - only: nightly - requires: - - pytorch_ios_11_2_1_nightly_x86_64_build - - pytorch_ios_11_2_1_nightly_arm64_build - pytorch_linux_build: build_environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32 docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c From c947ab0bb977dd995f0a96099b69fbab28377001 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Tue, 22 Sep 2020 02:02:29 -0700 Subject: [PATCH 002/449] Added sparse support for asin and neg functions, updated log1p (#44028) Summary: Description: - [x] added C++ code for sparse `asin` and `neg` ops similarly to `log1p` op - [x] added tests - [x] coalesced input CPU/CUDA - [x] uncoalesced input CPU/CUDA - [x] added tests for `negative` and `arcsin` Backprop will be addressed in another PR. 
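For reference, a minimal usage sketch of the behavior described above (illustration only, not part of the patch; the tensor values are made up, and it relies on the standard `torch.sparse_coo_tensor` constructor plus the sparse dispatch entries added below):

```python
import torch

# Uncoalesced input: index 0 appears twice, so coalescing sums its values (0.25 + 0.25 = 0.5).
i = torch.tensor([[0, 1, 0]])
v = torch.tensor([0.25, 0.50, 0.25])
s = torch.sparse_coo_tensor(i, v, (3,))

s.neg().to_dense()    # tensor([-0.5000, -0.5000,  0.0000])
s.asin().to_dense()   # ~tensor([0.5236, 0.5236, 0.0000]); the input is coalesced internally

# neg_ is allowed even on uncoalesced input, since negation distributes over
# the duplicate-index summation performed by coalesce().
s.clone().neg_()

# asin_ (like log1p_) is rejected on uncoalesced input because
# asin(v1 + v2) != asin(v1) + asin(v2); coalesce first.
s.coalesce().asin_()
```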
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44028 Reviewed By: agolynski Differential Revision: D23793027 Pulled By: mruberry fbshipit-source-id: 5fd642808da8e528cf6acd608ca0dcd720c4ccc3 --- aten/src/ATen/native/UnaryOps.cpp | 6 +- aten/src/ATen/native/native_functions.yaml | 12 + .../ATen/native/sparse/SparseTensorMath.cpp | 56 ++++- test/test_sparse.py | 206 ++++++++++++++---- 4 files changed, 233 insertions(+), 47 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 2764490f6d48..b5a6e2c017e7 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -405,9 +405,9 @@ Tensor& neg_out(Tensor& result, const Tensor& self) { Tensor neg(const Tensor& self) { return unary_op_impl(self, at::neg_out); } Tensor& neg_(Tensor& self) { return unary_op_impl_(self, at::neg_out); } -Tensor& negative_out(Tensor& result, const Tensor& self) { return at::native::neg_out(result, self); } -Tensor negative(const Tensor& self) { return at::native::neg(self); } -Tensor& negative_(Tensor& self) { return at::native::neg_(self); } +Tensor& negative_out(Tensor& result, const Tensor& self) { return at::neg_out(result, self); } +Tensor negative(const Tensor& self) { return self.neg(); } +Tensor& negative_(Tensor& self) { return self.neg_(); } Tensor logical_not(const Tensor& self) { Tensor result = at::empty({0}, self.options().dtype(kBool)); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 6541e45b3230..a84c9b4b61b8 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -553,8 +553,14 @@ - func: asin_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: asin_ + SparseCPU, SparseCUDA: asin_sparse_ - func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: asin_out + SparseCPU, SparseCUDA: asin_out_sparse # arcsin, alias of asin - func: arcsin(Tensor self) -> Tensor @@ -2716,8 +2722,14 @@ - func: neg_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: neg_ + SparseCPU, SparseCUDA: neg_sparse_ - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: neg_out + SparseCPU, SparseCUDA: neg_out_sparse # Alias for neg - func: negative(Tensor self) -> Tensor diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 9eee5e056dff..2bb5842b4726 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -95,16 +95,17 @@ SparseTensor& mul_out_sparse_scalar(SparseTensor& r, const SparseTensor& t, Scal // log1p(SparseTensor) // -------------------------------------------------------------------- -// TODO: add in-place variant +// In-place log1p on uncoalesced tensors is not supported since the operation is not a linear map. 
+// Values of uncoalesced tensor corresponding to the same indices are summed +// and log1p(summed_value) != log1p(v1) + log1p(v2) SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { - AT_ASSERT(r.is_sparse()); - AT_ASSERT(t.is_sparse()); + TORCH_CHECK(r.is_sparse(), "Tensor should be sparse"); + TORCH_CHECK(t.is_sparse(), "Tensor should be sparse"); if (is_same_tensor(r, t)) { // don't have in-place log1p for uncoalesced input because coalesce() is not in-place - TORCH_CHECK( - r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); + TORCH_CHECK(r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported"); } else { copy_sparse_to_sparse_(r, t.coalesce()); @@ -114,10 +115,53 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { } SparseTensor& log1p_sparse_(SparseTensor& t) { - TORCH_CHECK(t.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); return log1p_out_sparse(t, t); } +// -------------------------------------------------------------------- +// neg(SparseTensor) +// -------------------------------------------------------------------- + +SparseTensor& neg_out_sparse(SparseTensor& r, const SparseTensor& t) { + TORCH_CHECK(r.is_sparse(), "Tensor should be sparse"); + TORCH_CHECK(t.is_sparse(), "Tensor should be sparse"); + + // copy_sparse_ does not perform the copy if it is the same tensor + copy_sparse_to_sparse_(r, t); + r._values().neg_(); + return r; +} + +SparseTensor& neg_sparse_(SparseTensor& t) { + return neg_out_sparse(t, t); +} + +// -------------------------------------------------------------------- +// asin(SparseTensor) +// -------------------------------------------------------------------- + +// In-place asin on uncoalesced tensors is not supported since the operation is not a linear map. 
+// Values of uncoalesced tensor corresponding to the same indices are summed +// and asin(summed_value) != asin(v1) + asin(v2) + +SparseTensor& asin_out_sparse(SparseTensor& r, const SparseTensor& t) { + TORCH_CHECK(r.is_sparse(), "Tensor should be sparse"); + TORCH_CHECK(t.is_sparse(), "Tensor should be sparse"); + + if (is_same_tensor(r, t)) { + // don't have in-place asin for uncoalesced input because coalesce() is not in-place, see above comment + TORCH_CHECK(r.is_coalesced(), "asin: in-place on uncoalesced tensors is not supported"); + } else { + copy_sparse_to_sparse_(r, t.coalesce()); + } + r._values().asin_(); + return r; +} + +SparseTensor& asin_sparse_(SparseTensor& t) { + return asin_out_sparse(t, t); +} + // -------------------------------------------------------------------- // pow(SparseTensor, Scalar) // -------------------------------------------------------------------- diff --git a/test/test_sparse.py b/test/test_sparse.py index 64846ab729eb..af833be6810c 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -6,6 +6,7 @@ import itertools import functools +import operator import random import unittest from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, \ @@ -1728,53 +1729,182 @@ def test_narrow(self): self.assertRaises(RuntimeError, lambda: with_dense.narrow_copy(10, 0, 3)) # dim > sparseDim + denseDim - def _test_log1p_tensor(self, input, dense_tensor): + def _test_log1p_tensor(self, sparse_tensor): + dense_tensor = sparse_tensor.to_dense() expected_output = dense_tensor.log1p() - self.assertEqual(expected_output, input.log1p().to_dense()) - self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) - # test in-place op on uncoalesced input - with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported yet"): - input.log1p_() + self.assertEqual(expected_output, sparse_tensor.log1p().to_dense()) + self.assertEqual(expected_output, sparse_tensor.coalesce().log1p_().to_dense()) - input.requires_grad_() - self.assertTrue(input.requires_grad) + if self.is_uncoalesced: + # test in-place op on uncoalesced input + with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported"): + sparse_tensor.log1p_() + + sparse_tensor.requires_grad_() + self.assertTrue(sparse_tensor.requires_grad) # test autograd - x = input.clone() - y = input.log1p() + x = sparse_tensor.clone() + y = sparse_tensor.log1p() with self.assertRaisesRegex(RuntimeError, "log1p of a sparse tensor is made to be non-differentiable"): y.backward(x) def test_log1p(self): - input = torch.sparse_coo_tensor( - torch.LongTensor([[0], [1], [2]]).transpose(1, 0).clone().detach(), - torch.FloatTensor([3, 4, 5]), - torch.Size([3]), - device=self.device) - self._test_log1p_tensor(input, torch.as_tensor([3, 4, 5], dtype=torch.float32)) - - # test uncoalesced input - input_uncoalesced = torch.sparse_coo_tensor( - torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0).clone().detach(), - torch.FloatTensor([2, 3, 4, 1, 1, 1]), - torch.Size([3]), - device=self.device) - self._test_log1p_tensor(input_uncoalesced, torch.as_tensor([3, 4, 5], dtype=torch.float32)) - - input = torch.sparse_coo_tensor( - torch.zeros([2, 0]), - torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), - torch.Size([0, 0, 5, 5, 5, 5, 5, 5, 0]), - device=self.device) - self._test_log1p_tensor(input, torch.zeros([0, 0, 5, 5, 5, 5, 5, 5, 0])) - - input = torch.sparse_coo_tensor( - torch.zeros([1, 5]), - torch.zeros([5, 6, 0]), - torch.Size([5, 
6, 0]), - device=self.device) - self._test_log1p_tensor(input, torch.zeros([5, 6, 0])) + if not self.is_uncoalesced: + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2]]).transpose(1, 0), + values=torch.tensor([3.0, 4.0, 5.0]), + size=[3, ], + device=self.device + ).coalesce() + self._test_log1p_tensor(input_coalesced) + + # hybrid sparse input + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[1, 3], [2, 4]]), + values=torch.tensor([[1.0, 3.0], [5.0, 7.0]]), + size=[4, 5, 2], + device=self.device + ).coalesce() + self._test_log1p_tensor(input_coalesced) + + if self.is_uncoalesced: + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + values=torch.tensor([2.0, 3.0, 4.0, 1.0, 1.0, 1.0]), + size=[3, ], + device=self.device + ) + self._test_log1p_tensor(input_uncoalesced) + + # test on empty sparse tensor + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.zeros([2, 0]), + values=torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + size=[0, 0, 5, 5, 5, 5, 5, 5, 0], + device=self.device + ) + self._test_log1p_tensor(input_uncoalesced) + + def _test_neg_negative(self, sparse_tensor): + dense_tensor = sparse_tensor.to_dense() + expected_output = dense_tensor.neg() + + ops = ( + torch.neg, torch.Tensor.neg, torch.Tensor.neg_, + torch.negative, torch.Tensor.negative, torch.Tensor.negative_, + operator.neg + ) + for op in ops: + sparse_tensor_copy = sparse_tensor.clone() + self.assertEqual(expected_output, op(sparse_tensor_copy).to_dense()) + + if op in (torch.neg, torch.negative): + sparse_tensor_out = torch.zeros_like(sparse_tensor) + op(sparse_tensor, out=sparse_tensor_out) + self.assertEqual(expected_output, sparse_tensor_out.to_dense()) + + def test_neg_negative(self): + + if not self.is_uncoalesced: + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0, 1, 2]]), + values=torch.tensor([3.0, -4.0, 5.0]), + size=[3, ], + device=self.device + ).coalesce() + self._test_neg_negative(input_coalesced) + + # hybrid sparse input + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[1, 3], [2, 4]]), + values=torch.tensor([[-1.0, 3.0], [-5.0, 7.0]]), + size=[4, 5, 2], + device=self.device + ).coalesce() + self._test_neg_negative(input_coalesced) + + if self.is_uncoalesced: + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + values=torch.tensor([2.0, -3.0, -4.0, 1.0, -1.0, 1.5]), + size=[3, ], + device=self.device + ) + self._test_neg_negative(input_uncoalesced) + + # test on empty sparse tensor + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.zeros([2, 0]), + values=torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + size=[0, 0, 5, 5, 5, 5, 5, 5, 0], + device=self.device + ) + self._test_neg_negative(input_uncoalesced) + + def _test_asin_arcsin(self, sparse_tensor): + dense_tensor = sparse_tensor.to_dense() + expected_output = dense_tensor.asin() + + ops = ( + torch.asin, torch.Tensor.asin, + torch.arcsin, torch.Tensor.arcsin, + ) + for op in ops: + self.assertEqual(expected_output, op(sparse_tensor).to_dense()) + if op in (torch.asin, torch.arcsin): + sparse_tensor_out = torch.zeros_like(sparse_tensor) + op(sparse_tensor, out=sparse_tensor_out) + self.assertEqual(expected_output, sparse_tensor_out.to_dense()) + + for op in (torch.Tensor.asin_, torch.Tensor.arcsin_): + self.assertEqual(expected_output, 
op(sparse_tensor.clone().coalesce()).to_dense()) + if self.is_uncoalesced: + # test in-place op on uncoalesced input + with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported"): + op(sparse_tensor) + + def test_asin_arcsin(self): + + if not self.is_uncoalesced: + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0, 1, 2, 3]]), + values=torch.tensor([0.5, -0.5, 0.7, -0.7]), + size=[4, ], + device=self.device + ).coalesce() + self._test_asin_arcsin(input_coalesced) + + # hybrid sparse input + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[1, 3], [2, 4]]), + values=torch.tensor([[-0.1, 0.24], [-0.44, 0.1]]), + size=[4, 5, 2], + device=self.device + ).coalesce() + self._test_asin_arcsin(input_coalesced) + + if self.is_uncoalesced: + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + values=torch.tensor([0.3, -0.3, -0.4, 0.3, -0.5, 0.15]), + size=[3, ], + device=self.device + ) + self._test_asin_arcsin(input_uncoalesced) + + # test on empty sparse tensor + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.zeros([2, 0]), + values=torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + size=[0, 0, 5, 5, 5, 5, 5, 5, 0], + device=self.device + ) + self._test_asin_arcsin(input_uncoalesced) def test_mv(self): def test_shape(di, dj, dk, nnz): From 339961187a9750e8d5f10954ce78ea8cf819987c Mon Sep 17 00:00:00 2001 From: Jiakai Liu Date: Tue, 22 Sep 2020 03:18:30 -0700 Subject: [PATCH 003/449] [pytorch] refine dispatch keys in native_functions.yaml (1/N) (#45010) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45010 The motivation of this change is to differentiate "backend specific" ops and "generic" ops. "backend specific" ops are those invoking backend specific kernels thus only able to run on certain backends, e.g.: CPU, CUDA. "generic" ops are those not *directly* invoking backend specific kernels. They are usually calling other "backend specific" ops to get things done. Thus, they are also referred to as "composite" ops, or "math" ops (because they are usually pure C++ code constructed from math formula). The other way to see the difference is that: we have to implement new kernels for the "backend specific" ops if we want to run these ops on a new backend. In contrast, "generic"/"composite" ops can run on the new backend if we've added support for all the "backend specific" ops to which they delegate their work. Historically we didn't make a deliberate effort to always populate supported backends to the "dispatch" section for all the "backend specific" ops in native_functions.yaml. So now there are many ops which don't have "dispatch" section but are actually "backend specific" ops. Majority of them are calling "DispatchStub" kernels, which usually only support CPU/CUDA (via TensorIterator) or QuantizedCPU/CUDA. The ultimate goal is to be able to differentiate these two types of ops by looking at the "dispatch" section in native_functions.yaml. This PR leveraged the analysis script on #44963 to populate missing dispatch keys for a set of "backend specific" ops. As the initial step, we only deal with the simplest case: * These ops don't already have dispatch section in native_functions.yaml; * These ops call one or more DispatchStub (thus "backend specific"); * These ops don't call any other aten ops - except for some common ones almost every op calls via framework, e.g. 
calling aten::eq via Dispatcher::checkSchemaCompatibility. Calling other nontrivial aten ops is a sign of being "composite", so we don't want to deal with this case now; * These ops don't call Tensor::is_quantized() / Tensor::is_sparse() / etc. Some ops call thse Tensor::is_XXX() methods to dispatch to quantized / sparse kernels internally. We don't deal with this case now. Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D23803951 Pulled By: ljk53 fbshipit-source-id: aaced7c34427d1ede72380af4513508df366ea16 --- aten/src/ATen/native/native_functions.yaml | 142 ++++++++++++++++++++- 1 file changed, 137 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index a84c9b4b61b8..8aac7483ff2a 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -167,13 +167,13 @@ - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) variants: function dispatch: - CUDA: fused_dropout_cuda + CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor use_c10_dispatcher: full variants: function dispatch: - CUDA: masked_scale_cuda + CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -290,6 +290,8 @@ variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: conj_out - func: _conj(Tensor self) -> Tensor use_c10_dispatcher: full @@ -304,6 +306,8 @@ variants: function, method - func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acos_out # arccos, alias of acos - func: arccos(Tensor self) -> Tensor @@ -480,6 +484,8 @@ variants: function, method - func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acosh_out # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor @@ -501,6 +507,8 @@ variants: function, method - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: asinh_out # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor @@ -522,6 +530,8 @@ variants: function, method - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atanh_out # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor @@ -582,6 +592,8 @@ variants: function, method - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan_out # arctan, alias of atan - func: arctan(Tensor self) -> Tensor @@ -673,6 +685,8 @@ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity @@ -917,12 +931,16 @@ variants: function - func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: complex_out - func: polar(Tensor abs, Tensor angle) -> Tensor use_c10_dispatcher: full variants: function - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: polar_out - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor use_c10_dispatcher: full @@ -996,6 +1014,8 @@ variants: function, method - func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1006,6 +1026,8 @@ variants: function, method - func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full @@ -1189,7 +1211,7 @@ - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) use_c10_dispatcher: full dispatch: - CPU: ctc_loss_cpu + CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor @@ -1455,6 +1477,8 @@ variants: function, method - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1465,6 +1489,8 @@ variants: function, method - func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1475,6 +1501,8 @@ variants: function, method - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp_out - func: exp2(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1485,6 +1513,8 @@ variants: function, method - func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1599,6 +1629,8 @@ variants: function, method - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: frac_out - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -1617,6 +1649,8 @@ CPU: from_file - func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: gcd_out - func: gcd(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1627,6 +1661,8 @@ variants: function, method - func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: lcm_out - func: lcm(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1994,12 +2030,16 @@ CPU, CUDA: log2_out - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2714,6 +2754,8 @@ variants: function, method - func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2741,6 +2783,8 @@ variants: function, method - func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: negative_out - func: repeat(Tensor self, int[] repeats) -> Tensor use_c10_dispatcher: full @@ -2900,6 +2944,8 @@ - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: silu_out - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -2921,6 +2967,8 @@ MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sigmoid_out - func: logit(Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full @@ -2935,6 +2983,8 @@ CPU, CUDA: logit_ - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2957,6 +3007,8 @@ variants: function, method - func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sinh_out # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. @@ -3167,6 +3219,8 @@ variants: function, method - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3236,6 +3290,8 @@ variants: function, method - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3249,6 +3305,8 @@ variants: function, method - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor use_c10_dispatcher: full @@ -3596,8 +3654,8 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full dispatch: - SparseCPU: _sparse_sum_backward_cpu - SparseCUDA: _sparse_sum_backward_cuda + SparseCPU: _sparse_sum_backward_cpu + SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full @@ -4799,6 +4857,8 @@ - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) use_c10_dispatcher: full @@ -4817,6 +4877,8 @@ - func: digamma_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) use_c10_dispatcher: full @@ -4906,27 +4968,41 @@ - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) 
self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: geometric_ # wrappers for TH functions @@ -5380,6 +5456,8 @@ use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5390,6 +5468,8 @@ variants: method - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5609,12 +5689,16 @@ CPU, CUDA: lgamma - func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor use_c10_dispatcher: full @@ -5647,6 +5731,8 @@ variants: function, method - func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: i0_out - func: sign(Tensor self) -> Tensor use_c10_dispatcher: full @@ -5674,6 +5760,8 @@ variants: method, function - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -5740,19 +5828,27 @@ CUDA: fmod_cuda - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: hypot_out - func: hypot(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: hypot - func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method - func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nextafter_out - func: nextafter(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: nextafter - func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method @@ -6477,10 +6573,14 @@ - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu - func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6490,6 +6590,8 @@ - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu_backward - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6523,6 +6625,8 @@ - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor use_c10_dispatcher: full @@ -6534,6 +6638,8 @@ - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6562,6 +6668,8 @@ - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6572,14 +6680,20 @@ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6603,6 +6717,8 @@ - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) use_c10_dispatcher: full @@ -6668,10 +6784,14 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus - func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6681,13 +6801,19 @@ - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus_backward - func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink - func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6697,6 +6823,8 @@ - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink_backward - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -7468,6 +7596,8 @@ - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: logit_backward - func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7477,6 +7607,8 @@ - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: tanh_backward # What's a thnn_conv_ versus a slow_conv_? 
# From 71aeb84ab491ae005b6f4caf84b3892edaf968ad Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Tue, 22 Sep 2020 05:40:03 -0700 Subject: [PATCH 004/449] Revert D23803951: [pytorch] refine dispatch keys in native_functions.yaml (1/N) Test Plan: revert-hammer Differential Revision: D23803951 (https://github.com/pytorch/pytorch/commit/339961187a9750e8d5f10954ce78ea8cf819987c) Original commit changeset: aaced7c34427 fbshipit-source-id: fcc4fb6a2c1d79b587f62347b43f8851fe1647fd --- aten/src/ATen/native/native_functions.yaml | 142 +-------------------- 1 file changed, 5 insertions(+), 137 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8aac7483ff2a..a84c9b4b61b8 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -167,13 +167,13 @@ - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) variants: function dispatch: - CUDA: fused_dropout_cuda + CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor use_c10_dispatcher: full variants: function dispatch: - CUDA: masked_scale_cuda + CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -290,8 +290,6 @@ variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: conj_out - func: _conj(Tensor self) -> Tensor use_c10_dispatcher: full @@ -306,8 +304,6 @@ variants: function, method - func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: acos_out # arccos, alias of acos - func: arccos(Tensor self) -> Tensor @@ -484,8 +480,6 @@ variants: function, method - func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: acosh_out # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor @@ -507,8 +501,6 @@ variants: function, method - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: asinh_out # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor @@ -530,8 +522,6 @@ variants: function, method - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: atanh_out # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor @@ -592,8 +582,6 @@ variants: function, method - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: atan_out # arctan, alias of atan - func: arctan(Tensor self) -> Tensor @@ -685,8 +673,6 @@ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: bernoulli_ # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity @@ -931,16 +917,12 @@ variants: function - func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: complex_out - func: polar(Tensor abs, Tensor angle) -> Tensor use_c10_dispatcher: full variants: function - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: polar_out - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor use_c10_dispatcher: full @@ -1014,8 +996,6 @@ variants: function, method - func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
- dispatch: - CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1026,8 +1006,6 @@ variants: function, method - func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full @@ -1211,7 +1189,7 @@ - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) use_c10_dispatcher: full dispatch: - CPU: ctc_loss_cpu + CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor @@ -1477,8 +1455,6 @@ variants: function, method - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1489,8 +1465,6 @@ variants: function, method - func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1501,8 +1475,6 @@ variants: function, method - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: exp_out - func: exp2(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1513,8 +1485,6 @@ variants: function, method - func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1629,8 +1599,6 @@ variants: function, method - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: frac_out - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -1649,8 +1617,6 @@ CPU: from_file - func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: gcd_out - func: gcd(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1661,8 +1627,6 @@ variants: function, method - func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: lcm_out - func: lcm(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2030,16 +1994,12 @@ CPU, CUDA: log2_out - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2754,8 +2714,6 @@ variants: function, method - func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2783,8 +2741,6 @@ variants: function, method - func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: negative_out - func: repeat(Tensor self, int[] repeats) -> Tensor use_c10_dispatcher: full @@ -2944,8 +2900,6 @@ - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn - dispatch: - CPU, CUDA: silu_out - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -2967,8 +2921,6 @@ MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: sigmoid_out - func: logit(Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full @@ -2983,8 +2935,6 @@ CPU, CUDA: logit_ - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3007,8 +2957,6 @@ variants: function, method - func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: sinh_out # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. @@ -3219,8 +3167,6 @@ variants: function, method - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3290,8 +3236,6 @@ variants: function, method - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3305,8 +3249,6 @@ variants: function, method - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor use_c10_dispatcher: full @@ -3654,8 +3596,8 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full dispatch: - SparseCPU: _sparse_sum_backward_cpu - SparseCUDA: _sparse_sum_backward_cuda + SparseCPU: _sparse_sum_backward_cpu + SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full @@ -4857,8 +4799,6 @@ - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) use_c10_dispatcher: full variants: method - dispatch: - CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) use_c10_dispatcher: full @@ -4877,8 +4817,6 @@ - func: digamma_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: method - dispatch: - CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) use_c10_dispatcher: full @@ -4968,41 +4906,27 @@ - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) 
variants: method - dispatch: - CPU, CUDA: geometric_ # wrappers for TH functions @@ -5456,8 +5380,6 @@ use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5468,8 +5390,6 @@ variants: method - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5689,16 +5609,12 @@ CPU, CUDA: lgamma - func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor use_c10_dispatcher: full @@ -5731,8 +5647,6 @@ variants: function, method - func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: i0_out - func: sign(Tensor self) -> Tensor use_c10_dispatcher: full @@ -5760,8 +5674,6 @@ variants: method, function - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -5828,27 +5740,19 @@ CUDA: fmod_cuda - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: hypot_out - func: hypot(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - dispatch: - CPU, CUDA: hypot - func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method - func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: nextafter_out - func: nextafter(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - dispatch: - CPU, CUDA: nextafter - func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method @@ -6573,14 +6477,10 @@ - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - dispatch: - CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: elu - func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6590,8 +6490,6 @@ - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: elu_backward - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6625,8 +6523,6 @@ - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - dispatch: - CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor use_c10_dispatcher: full @@ -6638,8 +6534,6 @@ - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) 
use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6668,8 +6562,6 @@ - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6680,20 +6572,14 @@ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - dispatch: - CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6717,8 +6603,6 @@ - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) use_c10_dispatcher: full @@ -6784,14 +6668,10 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - dispatch: - CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: softplus - func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6801,19 +6681,13 @@ - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: softplus_backward - func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - dispatch: - CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: softshrink - func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6823,8 +6697,6 @@ - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: softshrink_backward - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -7596,8 +7468,6 @@ - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: logit_backward - func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7607,8 +7477,6 @@ - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: tanh_backward # What's a thnn_conv_ versus a slow_conv_? 
# From 1b059f2c6de83a9c2445ab3fb21d10893f66a839 Mon Sep 17 00:00:00 2001 From: Bugra Akyildiz Date: Tue, 22 Sep 2020 06:27:06 -0700 Subject: [PATCH 005/449] Directly use work.result() to retrieve tensor rather than passing as a separate argument (#44914) Summary: We currently are fetching an allreduced tensor from Python in C++ in, where we are storing the resulting tensor in a struct's parameter. This PR removes extra tensor paratemeter in the function parameter and fetch from a single place. Fixes https://github.com/pytorch/pytorch/issues/43960 Pull Request resolved: https://github.com/pytorch/pytorch/pull/44914 Reviewed By: rohan-varma Differential Revision: D23798888 Pulled By: bugra fbshipit-source-id: ad1b8c31c15e3758a57b17218bbb9dc1f61f1577 --- torch/csrc/distributed/c10d/reducer.cpp | 15 +++++++-------- torch/csrc/distributed/c10d/reducer.h | 1 - torch/nn/parallel/distributed.py | 2 +- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index ac4e735af94a..a895bea5fc26 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -425,11 +425,9 @@ std::vector> Reducer::get_bucket_tensors() const { void Reducer::set_forward_pass_work_handle( std::shared_ptr forwardPassWorkHandle, - at::Tensor& tensor, bool useStaticWorldSize) { std::lock_guard lock(mutex_); forwardPassWorkHandle_.workHandle = std::move(forwardPassWorkHandle); - forwardPassWorkHandle_.resultTensor = tensor; forwardPassWorkHandle_.useStaticWorldSize = useStaticWorldSize; } @@ -573,12 +571,13 @@ void Reducer::mark_variable_ready(VariableIndex index) { if (divFactor_ == kUnsetDivFactor) { divFactor_ = process_group_->getSize(); auto& workHandle = forwardPassWorkHandle_.workHandle; - if (workHandle) { - if (!forwardPassWorkHandle_.useStaticWorldSize) { - workHandle->wait(); - at::Tensor& res = forwardPassWorkHandle_.resultTensor; - divFactor_ = res.item().to(); - } + if (workHandle && !forwardPassWorkHandle_.useStaticWorldSize) { + workHandle->wait(); + auto results = workHandle->result(); + // Guard against the results being empty + TORCH_INTERNAL_ASSERT(results.size() > 0); + at::Tensor& res = results.front(); + divFactor_ = res.item().to(); } } diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 87ad60330af7..d45e5c2b90e1 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -89,7 +89,6 @@ class Reducer { // corresponding tensor being reduced. void set_forward_pass_work_handle( std::shared_ptr forwardPassWorkHandle, - at::Tensor& tensor, bool useStaticWorldSize); // Retrieve on-device tensors used to track locally unused parameters. 
For diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 1425f73dd365..44f5e6fe2ccb 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -581,7 +581,7 @@ def forward(self, *inputs, **kwargs): ) work = dist.all_reduce(ones, group=self.process_group, async_op=True) self.reducer._set_forward_pass_work_handle( - work, ones, self.ddp_join_divide_by_initial_world_size + work, self.ddp_join_divide_by_initial_world_size ) # Calling _rebuild_buckets before forward compuation, From 58b6ab69e5100adc01dd1bf272e70279ba6ae012 Mon Sep 17 00:00:00 2001 From: anjali411 Date: Tue, 22 Sep 2020 08:01:16 -0700 Subject: [PATCH 006/449] torch.sgn for complex tensors (#39955) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/39955 resolves https://github.com/pytorch/pytorch/issues/36323 by adding `torch.sgn` for complex tensors. `torch.sgn` returns `x/abs(x)` for `x != 0` and returns `0 + 0j` for `x==0` This PR doesn't test the correctness of the gradients. It will be done as a part of auditing all the ops in future once we decide the autograd behavior (JAX vs TF) and add gradchek. Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D23460526 Pulled By: anjali411 fbshipit-source-id: 70fc4e14e4d66196e27cf188e0422a335fc42f92 --- aten/src/ATen/core/aten_interned_strings.h | 1 + aten/src/ATen/cpu/vec256/vec256_base.h | 7 +++++ .../ATen/cpu/vec256/vec256_complex_double.h | 10 +++++++ .../ATen/cpu/vec256/vec256_complex_float.h | 10 +++++++ aten/src/ATen/native/UnaryOps.cpp | 12 ++++++++ aten/src/ATen/native/UnaryOps.h | 1 + aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 26 ++++++++++++----- aten/src/ATen/native/cpu/zmath.h | 9 ++++++ aten/src/ATen/native/cuda/UnarySignKernels.cu | 17 +++++++++++ aten/src/ATen/native/native_functions.yaml | 9 ++++++ .../operator_benchmark/pt/unary_test.py | 1 + docs/source/name_inference.rst | 2 ++ docs/source/tensors.rst | 2 ++ test/test_autograd.py | 2 +- test/test_torch.py | 15 ++++++++++ tools/autograd/derivatives.yaml | 11 ++++--- torch/_tensor_docs.py | 14 +++++++++ torch/_torch_docs.py | 25 ++++++++++++++++ torch/csrc/autograd/FunctionsManual.cpp | 29 +++++++++++++++++++ torch/csrc/autograd/FunctionsManual.h | 3 ++ torch/overrides.py | 1 + .../_internal/common_methods_invocations.py | 2 ++ 22 files changed, 196 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index c5e4b0ea3c01..4fa49302240b 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -611,6 +611,7 @@ _(aten, sigmoid) \ _(aten, sign) \ _(aten, signbit) \ _(aten, silu) \ +_(aten, sgn) \ _(aten, sin) \ _(aten, sinh) \ _(aten, size) \ diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index 0e66cb357965..49acbc518dca 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -239,6 +239,13 @@ struct Vec256 { // Specifically map() does not perform the type conversion needed by abs. 
return map([](T x) { return static_cast(std::abs(x)); }); } + + template ::value, int>::type = 0> + Vec256 sgn() const { + return map(at::native::sgn_impl); + } + template ::value, int>::type = 0> Vec256 angle() const { diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec256/vec256_complex_double.h index fbc7a480a4c0..0827b33a3122 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_double.h @@ -134,6 +134,16 @@ template <> class Vec256> { auto angle = _mm256_permute_pd(angle_(), 0x05); // angle 90-angle return _mm256_and_pd(angle, real_mask); // angle 0 } + Vec256> sgn() const { + auto abs = abs_(); + auto zero = _mm256_setzero_pd(); + auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); + auto abs_val = Vec256(abs); + + auto div = values / abs_val.values; // x / abs(x) + + return blendv(div, zero, mask); + } __m256d real_() const { const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000)); diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec256/vec256_complex_float.h index 892345e9d5c5..ea931acc494b 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_float.h @@ -171,6 +171,16 @@ template <> class Vec256> { auto angle = _mm256_permute_ps(angle_(), 0xB1); // angle 90-angle return _mm256_and_ps(angle, real_mask); // angle 0 } + Vec256> sgn() const { + auto abs = abs_(); + auto zero = _mm256_setzero_ps(); + auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); + auto abs_val = Vec256(abs); + + auto div = values / abs_val.values; // x / abs(x) + + return _mm256_blendv_ps(div, zero, mask); + } __m256 real_() const { const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)); diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index b5a6e2c017e7..f9af400ba2f4 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -301,6 +301,17 @@ Tensor& sign_out(Tensor& result, const Tensor& self) { return unary_op_impl_out( Tensor sign(const Tensor& self) { return unary_op_impl(self, at::sign_out); } Tensor& sign_(Tensor& self) { return unary_op_impl_(self, at::sign_out); } +Tensor& sgn_out(Tensor& result, const Tensor& self) { + if (self.is_complex()) { + return unary_op_impl_out(result, self, sgn_stub); + } else { + return unary_op_impl_out(result, self, sign_stub); + } +} + +Tensor sgn(const Tensor& self) { return unary_op_impl(self, at::sgn_out); } +Tensor& sgn_(Tensor& self) { return unary_op_impl_(self, at::sgn_out); } + Tensor& sin_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, sin_stub); } Tensor sin(const Tensor& self) { return unary_op_impl(self, at::sin_out); } Tensor& sin_(Tensor& self) { return unary_op_impl_(self, at::sin_out); } @@ -639,6 +650,7 @@ DEFINE_DISPATCH(sigmoid_stub); DEFINE_DISPATCH(logit_stub); DEFINE_DISPATCH(sign_stub); DEFINE_DISPATCH(signbit_stub); +DEFINE_DISPATCH(sgn_stub); DEFINE_DISPATCH(sin_stub); DEFINE_DISPATCH(sinh_stub); DEFINE_DISPATCH(sqrt_stub); diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index fa172cb58b38..0dcd5a0b9473 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -53,6 +53,7 @@ DECLARE_DISPATCH(unary_fn, sigmoid_stub); 
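Worth noting in `sgn_out` above: non-complex inputs are routed to the existing `sign_stub`, so for real dtypes `torch.sgn` should coincide with `torch.sign`; only complex inputs take the new kernel. A small illustrative check, not a test from this patch:

```
import torch

x = torch.tensor([-2.0, 0.0, 5.0])
print(torch.sign(x))  # tensor([-1., 0., 1.])
# After this patch, torch.sgn(x) hits the same sign kernel for real dtypes
# and should print the same values.
```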
DECLARE_DISPATCH(unary_fn_with_scalar, logit_stub); DECLARE_DISPATCH(unary_fn, sign_stub); DECLARE_DISPATCH(unary_fn, signbit_stub); +DECLARE_DISPATCH(unary_fn, sgn_stub); DECLARE_DISPATCH(unary_fn, sin_stub); DECLARE_DISPATCH(unary_fn, sinh_stub); DECLARE_DISPATCH(unary_fn, sqrt_stub); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index adf300522692..45c7e4e23762 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -270,16 +270,16 @@ static void sign_kernel(TensorIterator& iter){ auto one_vec = Vec256(static_cast(1)); cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return (0 < a) - (a < 0); }, - [=](Vec256 self_vec){ + iter, + [=](scalar_t a) -> scalar_t { return (0 < a) - (a < 0); }, + [=](Vec256 self_vec){ - // Comparision operators returns bitmask. - auto left = Vec256::blendv(zero_vec, one_vec, zero_vec < self_vec); - auto right = Vec256::blendv(zero_vec, one_vec, self_vec < zero_vec); + // Comparision operators returns bitmask. + auto left = Vec256::blendv(zero_vec, one_vec, zero_vec < self_vec); + auto right = Vec256::blendv(zero_vec, one_vec, self_vec < zero_vec); - return left - right; - }); + return left - right; + }); }); } } @@ -290,6 +290,15 @@ static void signbit_kernel(TensorIterator& iter){ }); } +static void sgn_kernel(TensorIterator& iter){ + AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), 'sgn_cpu', [&]() { + cpu_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { return sgn_impl(a); }, + [=](Vec256 a) { return a.sgn(); }); + }); +} + static void sinh_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "sinh_cpu", [&]() { cpu_kernel_vec( @@ -639,6 +648,7 @@ REGISTER_DISPATCH(reciprocal_stub, &reciprocal_kernel); REGISTER_DISPATCH(neg_stub, &neg_kernel); REGISTER_DISPATCH(sign_stub, &sign_kernel); REGISTER_DISPATCH(signbit_stub, &signbit_kernel); +REGISTER_DISPATCH(sgn_stub, &sgn_kernel); REGISTER_DISPATCH(sinh_stub, &sinh_kernel); REGISTER_DISPATCH(cosh_stub, &cosh_kernel); REGISTER_DISPATCH(acosh_stub, &acosh_kernel); diff --git a/aten/src/ATen/native/cpu/zmath.h b/aten/src/ATen/native/cpu/zmath.h index d6816f4dd182..e0554e0cbc29 100644 --- a/aten/src/ATen/native/cpu/zmath.h +++ b/aten/src/ATen/native/cpu/zmath.h @@ -138,6 +138,15 @@ inline c10::complex ceil_impl (c10::complex z) { return c10::complex(std::ceil(z.real()), std::ceil(z.imag())); } +template +inline c10::complex sgn_impl (c10::complex z) { + if (z == c10::complex(0, 0)) { + return c10::complex(0, 0); + } else { + return z / zabs(z); + } +} + template inline TYPE floor_impl (TYPE z) { return std::floor(z); diff --git a/aten/src/ATen/native/cuda/UnarySignKernels.cu b/aten/src/ATen/native/cuda/UnarySignKernels.cu index 3d90089556be..cd02c89f23f0 100644 --- a/aten/src/ATen/native/cuda/UnarySignKernels.cu +++ b/aten/src/ATen/native/cuda/UnarySignKernels.cu @@ -51,9 +51,26 @@ void signbit_kernel_cuda(TensorIterator& iter){ }); } +template +__host__ __device__ static inline c10::complex sgn_wrapper(c10::complex z) { + if (z == c10::complex(0, 0)) { + return c10::complex(0, 0); + } else { + return z / std::abs(z); + } +} + +void sgn_kernel_cuda(TensorIterator& iter){ + AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "sgn_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return sgn_wrapper(a); + }); + }); +} REGISTER_DISPATCH(logical_not_stub, &logical_not_kernel_cuda); REGISTER_DISPATCH(neg_stub, &neg_kernel_cuda); REGISTER_DISPATCH(sign_stub, 
&sign_kernel_cuda); REGISTER_DISPATCH(signbit_stub, &signbit_kernel_cuda); +REGISTER_DISPATCH(sgn_stub, &sgn_kernel_cuda); }} // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index a84c9b4b61b8..3244522f1808 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -277,6 +277,15 @@ use_c10_dispatcher: full variants: function +- func: sgn(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: sgn_(Tensor(a!) self) -> Tensor(a!) + variants: method + +- func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + - func: real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: function diff --git a/benchmarks/operator_benchmark/pt/unary_test.py b/benchmarks/operator_benchmark/pt/unary_test.py index 4a8a7865330b..1391283b1e10 100644 --- a/benchmarks/operator_benchmark/pt/unary_test.py +++ b/benchmarks/operator_benchmark/pt/unary_test.py @@ -91,6 +91,7 @@ def forward(self): ['sigmoid', torch.sigmoid], ['sigmoid_', torch.sigmoid_], ['sign', torch.sign], + ['sgn', torch.sgn], ['sin', torch.sin], ['sin_', torch.sin_], ['sinh', torch.sinh], diff --git a/docs/source/name_inference.rst b/docs/source/name_inference.rst index 7fc84e092633..ccbb8c0c54d3 100644 --- a/docs/source/name_inference.rst +++ b/docs/source/name_inference.rst @@ -197,6 +197,8 @@ If you don't see an operation listed here, but it would help your use case, plea :meth:`Tensor.sigmoid_`,None ":meth:`Tensor.sign`, :func:`torch.sign`",:ref:`keeps_input_names-doc` :meth:`Tensor.sign_`,None + ":meth:`Tensor.sgn`, :func:`torch.sgn`",:ref:`keeps_input_names-doc` + :meth:`Tensor.sgn_`,None ":meth:`Tensor.sin`, :func:`torch.sin`",:ref:`keeps_input_names-doc` :meth:`Tensor.sin_`,None ":meth:`Tensor.sinh`, :func:`torch.sinh`",:ref:`keeps_input_names-doc` diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index d7a94711e76b..cd1c363604fe 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -532,6 +532,8 @@ view of a storage and defines numeric operations on it. .. automethod:: sign .. automethod:: sign_ .. automethod:: signbit + .. automethod:: sgn + .. automethod:: sgn_ .. automethod:: sin .. automethod:: sin_ .. 
automethod:: sinh diff --git a/test/test_autograd.py b/test/test_autograd.py index 9d037fd7c138..938a41c2c089 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -4692,7 +4692,7 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, 'permute', 'squeeze', 'unsqueeze', 'resize', 'resize_as', 'tril', 'triu', 'chunk', 'split', 'split_with_sizes', 'repeat', 'expand', 'zero_', 'round', 'eq_', 'ne_', 'add', '__radd__', 'sum', 'conj', 'sin', 'cos', 'mul', 'sinh', - 'cosh', '__rmul__'] + separate_complex_tests + 'cosh', '__rmul__', 'sgn'] + separate_complex_tests # TODO(@anjali411): add the commented tests back after updating the formula based on tensorflow definition - @anjali411 # complex_list += ['fill_', 't', '__rdiv__', 'tanh'] diff --git a/test/test_torch.py b/test/test_torch.py index a2f5f21dab1e..c8dfd5115333 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -11288,6 +11288,19 @@ def test_signbit_complex(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'signbit is not implemented for complex tensors.'): torch.signbit(t, out=out) + @dtypes(torch.cfloat, torch.cdouble) + def test_sgn(self, device, dtype): + x = torch.randn(100, dtype=dtype) + angle = x.angle() + out = x.sgn() + self.assertEqual(out.angle(), angle) + self.assertEqual(out.abs(), torch.ones_like(x).real) + + x_out = torch.empty_like(x) + torch.sgn(x, out=x_out) + self.assertEqual(x_out.angle(), angle) + self.assertEqual(x_out.abs(), torch.ones_like(x).real) + @dtypes(*(torch.testing.get_all_dtypes(include_bool=False))) def test_signbit_non_boolean_output(self, device, dtype): # test non-boolean tensors as the `out=` parameters @@ -14709,6 +14722,8 @@ def _test_helper(x, y, bias, memory_format): lambda x, y: x.logit_(1e-6), lambda x, y: x.sign(), lambda x, y: x.sign_(), + lambda x, y: x.sgn(), + lambda x, y: x.sgn_(), lambda x, y: x.sin(), lambda x, y: x.sin_(), lambda x, y: x.sinh(), diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 9ee296e83035..70ddaee5226f 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -159,7 +159,7 @@ # NB: The parameter names here MUST be consistent with the parameter names # in Decalarations.yaml - name: abs(Tensor self) -> Tensor - self: grad * self.sign() + self: grad * self.sgn() - name: acos(Tensor self) -> Tensor self: grad * -((-self * self + 1).rsqrt()) @@ -397,11 +397,11 @@ # of the higher order derivatives, see https://github.com/pytorch/pytorch/issues/43414 # Note that we don't use "result" because saving it would be BC-breaking when it is used in an inplace operation later - name: div.Tensor(Tensor self, Tensor other) -> Tensor - self: grad / other - other: -grad * (self / other) / other + self: div_tensor_self_backward(grad, other, self.scalar_type()) + other: div_tensor_other_backward(grad, self, other) - name: div.Scalar(Tensor self, Scalar other) -> Tensor - self: grad / other + self: div_tensor_self_backward(grad, at::scalar_to_tensor(other), self.scalar_type()) - name: dot(Tensor self, Tensor tensor) -> Tensor self: grad * tensor @@ -928,6 +928,9 @@ - name: sign(Tensor self) -> Tensor self: zeros_like(grad) +- name: sgn(Tensor self) -> Tensor + self: sgn_backward(result, grad, self) + - name: sin(Tensor self) -> Tensor self: grad * self.cos().conj() diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 2a83aeca0de8..55c5613cdcc3 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -3121,6 +3121,20 @@ def callable(a, b) -> 
number See :func:`torch.signbit` """) +add_docstr_all('sgn', + r""" +sgn() -> Tensor + +See :func:`torch.sgn` +""") + +add_docstr_all('sgn_', + r""" +sgn_() -> Tensor + +In-place version of :meth:`~Tensor.sgn` +""") + add_docstr_all('sin', r""" sin() -> Tensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index d0f6f8c92151..5a3b2339fde5 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -6603,6 +6603,31 @@ def merge_dicts(*dicts): tensor([ False, True, False, False]) """.format(**common_args)) +add_docstr(torch.sgn, + r""" +sgn(input, *, out=None) -> Tensor + +For complex tensors, this function returns a new tensor whose elemants have the same angle as that of the +elements of :attr:`input` and absolute value 1. For a non-complex tensor, this function +returns the signs of the elements of :attr:`input` (see :func:`torch.sign`). + +:math:`\text{out}_{i} = 0`, if :math:`|{\text{{input}}_i}| == 0` +:math:`\text{out}_{i} = \frac{{\text{{input}}_i}}{|{\text{{input}}_i}|}`, otherwise + +""" + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> x=torch.tensor([3+4j, 7-24j, 0, 1+2j]) + >>> x.sgn() + tensor([0.6000+0.8000j, 0.2800-0.9600j, 0.0000+0.0000j, 0.4472+0.8944j]) +""".format(**common_args)) + add_docstr(torch.sin, r""" sin(input, out=None) -> Tensor diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 29f0720fb3c7..1e73ebac2a2a 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -211,6 +211,17 @@ Tensor mvlgamma_backward(Tensor grad, const Tensor & self, int64_t p) { return grad * args.digamma_().sum(-1); } +Tensor sgn_backward(Tensor result, Tensor grad, Tensor self) { + if (self.is_complex()) { + auto abs = at::abs(self); + // C -> C + // https://arxiv.org/pdf/1701.00392.pdf Section 4.20 + return at::where(abs == 0.0, at::zeros({}, grad.options()), (grad/abs - (at::real(grad/self) * result))); + } else { + return at::zeros_like(grad, at::MemoryFormat::Preserve); + } +} + Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st) { auto result = grad * other.conj(); if (!at::isComplexType(self_st) && result.is_complex()) { @@ -220,6 +231,24 @@ Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st) { return result; } +Tensor div_tensor_self_backward(Tensor grad, Tensor other, ScalarType self_st) { + auto result = grad / other.conj(); + if (!at::isComplexType(self_st) && result.is_complex()) { + // R -> C + result = at::real(result); + } + return result; +} + +Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other) { + auto result = -grad * ((self / other) / other).conj(); + if (!other.is_complex() && result.is_complex()) { + // R -> C + result = at::real(result); + } + return result; +} + Tensor permute_backwards(const Tensor & grad, IntArrayRef fwd_dims) { // invert the permutation auto ndims = fwd_dims.size(); diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index b4e7d1667f88..8fd0e9b08cc4 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -44,6 +44,8 @@ at::Tensor pow_backward_self(at::Tensor grad, const at::Tensor & self, const at: at::Tensor pow_backward_exponent(at::Tensor grad, const at::Tensor& self, const at::Tensor& exponent, at::Tensor result); at::Tensor pow_backward_exponent(at::Tensor grad, const at::Scalar & base, const at::Tensor& exponent, at::Tensor result); at::Tensor mul_tensor_backward(Tensor 
grad, Tensor other, ScalarType self_st); +at::Tensor div_tensor_self_backward(Tensor grad, Tensor other, ScalarType self_st); +at::Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other); at::Tensor mvlgamma_backward(at::Tensor grad, const at::Tensor & self, int64_t p); at::Tensor permute_backwards(const at::Tensor & grad, at::IntArrayRef fwd_dims); at::Tensor rad2deg_backward(const at::Tensor& grad); @@ -74,6 +76,7 @@ at::Tensor sum_tensorlist(at::TensorList tl); at::Tensor repeat_backward(at::Tensor grad, int64_t input_dims, at::IntArrayRef repeats); at::Tensor _fused_dropout_backward(at::Tensor grad, at::Tensor mask, double p1m); at::Tensor evenly_distribute_backward(at::Tensor grad, const at::Tensor & input, const at::Tensor & value); +at::Tensor sgn_backward(Tensor result, Tensor grad, Tensor self); at::Tensor var_backward(const at::Tensor & grad, const at::Tensor & self, bool unbiased); at::Tensor var_backward(at::Tensor grad, const at::Tensor & self, at::IntArrayRef dim, bool unbiased, bool keepdim); at::Tensor std_backward(const at::Tensor & result, const at::Tensor & grad, const at::Tensor & self, bool unbiased); diff --git a/torch/overrides.py b/torch/overrides.py index 60f615bb1b0e..d17c6c4f7473 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -701,6 +701,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.sigmoid: lambda input, out=None: -1, torch.sign: lambda input, out=None: -1, torch.signbit: lambda input, out=None: -1, + torch.sgn: lambda input, out=None: -1, torch.sin: lambda input, out=None: -1, torch.sinh: lambda input, out=None: -1, torch.slogdet: lambda input: -1, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 643261461fc8..dd429deacbf0 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -696,6 +696,8 @@ def method_tests(): ('round', (), NO_ARGS, 'scalar', (True,)), ('sign', (S, S, S), NO_ARGS), ('sign', (), NO_ARGS, 'scalar'), + ('sgn', (S, S, S), NO_ARGS), + ('sgn', (), NO_ARGS, 'scalar'), ('trunc', (S, S, S), NO_ARGS, '', (True,)), ('trunc', (), NO_ARGS, 'scalar', (True,)), ('floor', (S, S, S), NO_ARGS, '', (True,)), From 36ec8f8fb8d0356db7cb67230500f70833a2b2ba Mon Sep 17 00:00:00 2001 From: Brandon Lin Date: Tue, 22 Sep 2020 08:22:58 -0700 Subject: [PATCH 007/449] [dper3] Create dper LearningRate low-level module (#44639) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44639 As title; this will unblock migration of several modules that need learning rate functionality. Test Plan: ``` buck test //dper3/dper3/modules/low_level_modules/tests:learning_rate_test ``` Reviewed By: yf225 Differential Revision: D23681733 fbshipit-source-id: 1d98cb35bf6a4ff0718c9cb6abf22401980b523c --- caffe2/sgd/learning_rate_op.cc | 5 ++-- caffe2/sgd/learning_rate_op.h | 10 ++++---- .../check_backward_compatibility.py | 24 ++++++++++--------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/caffe2/sgd/learning_rate_op.cc b/caffe2/sgd/learning_rate_op.cc index 534f89d68360..e8172ab65efe 100644 --- a/caffe2/sgd/learning_rate_op.cc +++ b/caffe2/sgd/learning_rate_op.cc @@ -164,7 +164,7 @@ C10_EXPORT_CAFFE2_OP_TO_C10_CPU( "int? max_iter = -1, " "int? num_iter = 0, " "float? start_multiplier = 0, " - "float? end_mulitplier = 0, " + "float? end_multiplier = 0, " "float? multiplier = 0.5, " "float? multiplier_1 = 1.0, " "float? 
multiplier_2 = 1.0, " @@ -184,5 +184,6 @@ C10_EXPORT_CAFFE2_OP_TO_C10_CPU( "float? cosine_max_lr = 0.05, " "int? cosine_period = 50, " "float? cosine_t_mult = 1.0, " - "float? cosine_lr_shrink = 0.99) -> Tensor output", + "float? cosine_lr_shrink = 0.99, " + "float? decay = 1.0) -> Tensor output", LearningRateOpFloatCPU); diff --git a/caffe2/sgd/learning_rate_op.h b/caffe2/sgd/learning_rate_op.h index fa35ad4c8d6f..3ba6bef39e63 100644 --- a/caffe2/sgd/learning_rate_op.h +++ b/caffe2/sgd/learning_rate_op.h @@ -81,13 +81,13 @@ class LearningRateOp final : public Operator { return new HillLearningRate( num_iter, start_multiplier, gamma, power, end_multiplier); } else if (policy == "slope") { - int64_t num_iter_1 = - this->template GetSingleArgument(arg_prefix + "num_iter_1", 0); + int64_t num_iter_1 = this->template GetSingleArgument( + arg_prefix + "num_iter_1", 0); DCHECK_GT(num_iter_1, 0); T multiplier_1 = this->template GetSingleArgument( arg_prefix + "multiplier_1", 0.); - int64_t num_iter_2 = - this->template GetSingleArgument(arg_prefix + "num_iter_2", 0); + int64_t num_iter_2 = this->template GetSingleArgument( + arg_prefix + "num_iter_2", 0); DCHECK_GT(num_iter_1, 0); T multiplier_2 = this->template GetSingleArgument( arg_prefix + "multiplier_2", 0.); @@ -191,7 +191,7 @@ class LearningRateOp final : public Operator { int stepsize = this->template GetSingleArgument(arg_prefix + "stepsize", 0); T decay = - this->template GetSingleArgument(arg_prefix + "decay", 1.0); + this->template GetSingleArgument(arg_prefix + "decay", 1.0); DCHECK_GT(stepsize, 0); DCHECK_GE(max_lr, base_lr_); return new CyclicalLearningRate(base_lr_, max_lr, stepsize, decay); diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index d5cbe5a884a9..739a4de51951 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -1,4 +1,3 @@ - import argparse import datetime import re @@ -58,16 +57,16 @@ ("aten::atan2", datetime.date(2020, 7, 30)), ("aten::copy_", datetime.date(2020, 7, 30)), ("aten::sort", datetime.date(2020, 7, 30)), - ('aten::_convolution', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose', datetime.date(2020, 10, 15)), - ('aten::_convolution_double_backward', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_backward_input', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_backward', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_backward_weight', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose_backward', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose_backward_input', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose_backward_weight', datetime.date(2020, 10, 15)), + ("aten::_convolution", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose", datetime.date(2020, 10, 15)), + ("aten::_convolution_double_backward", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_backward_input", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_backward", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_backward_weight", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose_backward", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose_backward_input", 
datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose_backward_weight", datetime.date(2020, 10, 15)), ("aten::_cudnn_init_dropout_state", datetime.date(2020, 7, 30)), ("aten::sparse_coo_tensor", datetime.date(2020, 7, 30)), ("aten::_sparse_coo_tensor_with_dims", datetime.date(2020, 7, 30)), @@ -90,6 +89,7 @@ ("aten::logspace", datetime.date(2020, 9, 30)), ("aten::logspace.out", datetime.date(2020, 9, 30)), ("__getstate__", datetime.date(2020, 9, 11), "Conv[23]dPackedParams"), + ("_caffe2::LearningRate", datetime.date(2020, 10, 1)), ("aten::_var", datetime.date(2020, 10, 1)), ("aten::_std", datetime.date(2020, 10, 1)), ("aten::_foreach_add_", datetime.date(2020, 10, 1)), @@ -115,6 +115,7 @@ def allow_listed(schema, allow_list): return True return False + # The nightly will fail to parse newly added syntax to schema declarations # Add new schemas that will fail the nightly here dont_parse_list = [ @@ -122,6 +123,7 @@ def allow_listed(schema, allow_list): ("test_backend", datetime.date(2099, 9, 17)), ] + def dont_parse(schema_line): for item in dont_parse_list: if item[1] < datetime.date.today(): From 4a0aa69a66cd7dac8dfba2268163c5b5dec899f4 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Tue, 22 Sep 2020 08:50:55 -0700 Subject: [PATCH 008/449] Fix undefined variable 'namedshape' in tensor.py (#45085) Summary: Hot Fix Pull Request resolved: https://github.com/pytorch/pytorch/pull/45085 Reviewed By: malfet, seemethere Differential Revision: D23824444 Pulled By: walterddr fbshipit-source-id: c9f37b394d281b7ef44b14c30699bb7510a362a7 --- torch/tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/tensor.py b/torch/tensor.py index be79dd5c3cd8..18dccfda7c8b 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -845,7 +845,7 @@ def unflatten(self, dim, sizes): relevant_args = (self,) from torch.overrides import has_torch_function, handle_torch_function if type(self) is not Tensor and has_torch_function(relevant_args): - return handle_torch_function(Tensor.unflatten, relevant_args, self, dim, namedshape) + return handle_torch_function(Tensor.unflatten, relevant_args, self, dim, sizes) if not sizes: raise RuntimeError("unflatten: sizes must be non-empty") From e155fbe915ff4553d0c0f81df728d606498fee15 Mon Sep 17 00:00:00 2001 From: albanD Date: Tue, 22 Sep 2020 08:51:58 -0700 Subject: [PATCH 009/449] add warning when ParameterList/Dict is used with DataParallel (#44405) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44405 Test Plan: Imported from OSS Reviewed By: agolynski Differential Revision: D23783987 Pulled By: albanD fbshipit-source-id: 5018b0d381cb09301d2f88a98a910854f740ace1 --- test/distributed/test_data_parallel.py | 30 ++++++++++++++++++++++++++ test/test_nn.py | 13 +++++++++++ torch/nn/modules/container.py | 24 +++++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/test/distributed/test_data_parallel.py b/test/distributed/test_data_parallel.py index dee5fd702b16..99a10906462a 100644 --- a/test/distributed/test_data_parallel.py +++ b/test/distributed/test_data_parallel.py @@ -775,6 +775,36 @@ def forward(self, x): print("Caught exception during iterations at " + named_msg, flush=True) raise + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + def test_parameter_list_dict_replica(self): + class MyMod(torch.nn.Module): + def __init__(self, data): + super(MyMod, self).__init__() + self.data = data + + def forward(self, inp): + return inp + + p1 = torch.nn.Parameter(torch.rand(10)) + p2 = 
torch.nn.Parameter(torch.rand(10)) + module = MyMod(torch.nn.ParameterList([p1, p2])).cuda() + model = dp.DataParallel(module) + input = torch.randn((8, 8), device="cuda") + + with self.assertWarnsRegex( + UserWarning, + r"nn\.ParameterList is being used with DataParallel but this"): + model(input) + + module = MyMod(torch.nn.ParameterDict({"0": p1, "1": p2})).cuda() + model = dp.DataParallel(module) + input = torch.randn((8, 8), device="cuda") + + with self.assertWarnsRegex( + UserWarning, + r"nn\.ParameterDict is being used with DataParallel but this"): + model(input) + if __name__ == '__main__': run_tests() diff --git a/test/test_nn.py b/test/test_nn.py index 7c8f6b7b2874..2dde3c46b74e 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -2646,6 +2646,19 @@ def test_weight_norm(self): m = torch.nn.utils.weight_norm(m) m = torch.nn.utils.weight_norm(m) + def test_parameterlistdict_setting_attributes(self): + mod = nn.ParameterList(map(nn.Parameter, [torch.rand(2), torch.rand(2)])) + + with self.assertWarnsRegex(UserWarning, + r"Setting attributes on ParameterList is not supported"): + torch.nn.utils.weight_norm(mod, "0") + + mod = nn.ParameterDict({"a": nn.Parameter(torch.rand(2)), "b": nn.Parameter(torch.rand(2))}) + + with self.assertWarnsRegex(UserWarning, + r"Setting attributes on ParameterDict is not supported"): + torch.nn.utils.weight_norm(mod, "b") + def test_weight_norm_pickle(self): m = torch.nn.utils.weight_norm(nn.Linear(5, 7)) m = pickle.loads(pickle.dumps(m)) diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index c9db80d64fdb..f5d07ae4a69c 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -429,6 +429,11 @@ def __setitem__(self, idx: int, param: 'Parameter') -> None: idx = self._get_abs_string_index(idx) return self.register_parameter(str(idx), param) + def __setattr__(self, key: Any, value: Any) -> None: + if not isinstance(value, torch.nn.Parameter): + warnings.warn("Setting attributes on ParameterList is not supported.") + super(ParameterList, self).__setattr__(key, value) + def __len__(self) -> int: return len(self._parameters) @@ -480,6 +485,13 @@ def extra_repr(self) -> str: def __call__(self, input): raise RuntimeError('ParameterList should not be called.') + def _replicate_for_data_parallel(self): + warnings.warn("nn.ParameterList is being used with DataParallel but this is not " + "supported. This list will appear empty for the models replicated " + "on each GPU except the original one.") + + return super(ParameterList, self)._replicate_for_data_parallel() + class ParameterDict(Module): r"""Holds parameters in a dictionary. @@ -533,6 +545,11 @@ def __setitem__(self, key: str, parameter: 'Parameter') -> None: def __delitem__(self, key: str) -> None: del self._parameters[key] + def __setattr__(self, key: Any, value: Any) -> None: + if not isinstance(value, torch.nn.Parameter): + warnings.warn("Setting attributes on ParameterDict is not supported.") + super(ParameterDict, self).__setattr__(key, value) + def __len__(self) -> int: return len(self._parameters) @@ -621,3 +638,10 @@ def extra_repr(self) -> str: def __call__(self, input): raise RuntimeError('ParameterDict should not be called.') + + def _replicate_for_data_parallel(self): + warnings.warn("nn.ParameterDict is being used with DataParallel but this is not " + "supported. 
This dict will appear empty for the models replicated " + "on each GPU except the original one.") + + return super(ParameterDict, self)._replicate_for_data_parallel() From 63fd257879db488e693b3b84ac4311a152df7497 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 22 Sep 2020 08:58:17 -0700 Subject: [PATCH 010/449] Add `Ellipsis` constant to the list of recognized tokens (#44959) Summary: Per https://docs.python.org/3.6/library/constants.html > `Ellipsis` is the same as ellipsis literal `...` Pull Request resolved: https://github.com/pytorch/pytorch/pull/44959 Reviewed By: suo Differential Revision: D23785660 Pulled By: malfet fbshipit-source-id: f68461849e7d16ef68042eb96566f2c936c06b0f --- torch/csrc/jit/frontend/lexer.h | 3 ++- torch/csrc/jit/frontend/parser.cpp | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/torch/csrc/jit/frontend/lexer.h b/torch/csrc/jit/frontend/lexer.h index f78dd7a7d11b..3a83d8b9a87f 100644 --- a/torch/csrc/jit/frontend/lexer.h +++ b/torch/csrc/jit/frontend/lexer.h @@ -111,7 +111,8 @@ namespace jit { _(TK_WITH, "with", "with") \ _(TK_WITH_ITEM, "withitem", "") \ _(TK_AS, "as", "as") \ - _(TK_PROP, "property", "") + _(TK_PROP, "property", "") \ + _(TK_ELLIPSIS, "Ellipsis", "Ellipsis") enum TokenKind { // we use characters to represent themselves so skip all valid characters diff --git a/torch/csrc/jit/frontend/parser.cpp b/torch/csrc/jit/frontend/parser.cpp index 66c75d8a499d..c9f4aac038cc 100644 --- a/torch/csrc/jit/frontend/parser.cpp +++ b/torch/csrc/jit/frontend/parser.cpp @@ -167,6 +167,10 @@ struct ParserImpl { prefix = Dots::create(L.cur().range); L.next(); } break; + case TK_ELLIPSIS: { + prefix = Dots::create(L.cur().range); + L.next(); + } break; default: { Ident name = parseIdent(); prefix = Var::create(name.range(), name); From 9fc7a942f0043a79ccd0ef0c3d55a844249b52d3 Mon Sep 17 00:00:00 2001 From: Himangshu Date: Tue, 22 Sep 2020 09:05:41 -0700 Subject: [PATCH 011/449] Change from self to self.class() in _DecoratorManager to ensure a new object is every time a function is called recursively (#44633) Summary: Change from self to self._class_() in _DecoratorManager to ensure a new object is every time a function is called recursively Fixes https://github.com/pytorch/pytorch/issues/44531 Pull Request resolved: https://github.com/pytorch/pytorch/pull/44633 Reviewed By: agolynski Differential Revision: D23783601 Pulled By: albanD fbshipit-source-id: a818664dee7bdb061a40ede27ef99e9546fc80bb --- test/test_autograd.py | 47 +++++++++++++++++++++++++++++++++++++ torch/autograd/grad_mode.py | 4 ++-- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 938a41c2c089..c03c1a496605 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -1001,6 +1001,53 @@ def gen_enable_grad(): for _ in gen_enable_grad(): self.assertEqual(torch.is_grad_enabled(), False) + def test_set_grad_generator_functions_recursive(self): + # enable_grad_decorator_recursive and no_grad_decorator_recursive call each other + # recursively, to ensure that the decorators preserve the caller's setting + @torch.enable_grad() + def enable_grad_decorator_recursive(depth): + self.assertTrue(torch.is_grad_enabled()) + if depth > 0: + no_grad_decorator_recursive(depth - 1) + self.assertTrue(torch.is_grad_enabled()) + + @torch.no_grad() + def no_grad_decorator_recursive(depth): + self.assertFalse(torch.is_grad_enabled()) + if depth > 0: + enable_grad_decorator_recursive(depth - 1) + 
self.assertFalse(torch.is_grad_enabled()) + + # enable_grad_context_manager_recursive and no_grad_context_manager_recursive call + # each other recursively, to ensure that the decorators preserve the caller's setting + def enable_grad_context_manager_recursive(depth): + with torch.enable_grad(): + self.assertTrue(torch.is_grad_enabled()) + if depth > 0: + no_grad_context_manager_recursive(depth - 1) + self.assertTrue(torch.is_grad_enabled()) + + def no_grad_context_manager_recursive(depth): + with torch.no_grad(): + self.assertFalse(torch.is_grad_enabled()) + if depth > 0: + enable_grad_context_manager_recursive(depth - 1) + self.assertFalse(torch.is_grad_enabled()) + + with torch.enable_grad(): + self.assertTrue(torch.is_grad_enabled()) + enable_grad_decorator_recursive(10) + self.assertTrue(torch.is_grad_enabled()) + enable_grad_context_manager_recursive(10) + self.assertTrue(torch.is_grad_enabled()) + + with torch.no_grad(): + self.assertFalse(torch.is_grad_enabled()) + enable_grad_decorator_recursive(10) + self.assertFalse(torch.is_grad_enabled()) + enable_grad_context_manager_recursive(10) + self.assertFalse(torch.is_grad_enabled()) + def test_no_grad_python_function(self): """Python Functions should respect grad mode.""" x = torch.ones(5, 5, requires_grad=True) diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py index 4bcc3be1d85b..bbd96e941a54 100644 --- a/torch/autograd/grad_mode.py +++ b/torch/autograd/grad_mode.py @@ -22,7 +22,7 @@ def __call__(self, func: F) -> F: @functools.wraps(func) def decorate_context(*args, **kwargs): - with self: + with self.__class__(): return func(*args, **kwargs) return cast(F, decorate_context) @@ -33,7 +33,7 @@ def generator_context(*args, **kwargs): gen = func(*args, **kwargs) while True: try: - with self: + with self.__class__(): x = next(gen) yield x except StopIteration: From ae286d81e00f45b81778635c1aa482d64f2ec7bc Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Tue, 22 Sep 2020 09:37:00 -0700 Subject: [PATCH 012/449] [JIT] improve alias analysis for list constructs (#39111) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/39111 In our present alias analysis, we consider any Value that enter another container as entering the heap, and thus aliasing all other heap values of the same type. There are a number of advantages to this approach: - it is not to hard to maintain the aliasDb implementation - it is much easier from an op schema perspective - there are many composite list ops registered internally and externally that would be tricky to register and get right if we did something more complicated - It limits the size of the AliasDb, because a container of size 10 only contains a single memory dag element instead of 10 elements. The downside is that we have are unable to handle the simple and extremely common case of a list of tensors being used in an ATen op. In an example like: ``` def foo(input): x = torch.tensor([1, 2, 3, 4]) y = [x, x] input.add_(1) return torch.cat(y) ``` we will consider x to be written to. any write to any wildcard element (an element that enters a tuple, an element that is taken from a list) will mark x as written to. This can be limiting for our ability to create a functional subset and fuse graphs - as a result, 4 of TorchVision classification models could not be functionalized. 
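To see why the relaxed handling is sound for this pattern: the ops in question copy their inputs, so their output never aliases the list or the tensors it contains. A short eager-mode illustration of the `cat` case from the example above (a sketch added for clarity, not part of the patch):

```
import torch

x = torch.tensor([1, 2, 3, 4])
y = [x, x]
out = torch.cat(y)   # copies the data; `out` does not alias `x`
x.add_(1)            # a later in-place update to x ...
print(out)           # ... is not observable: tensor([1, 2, 3, 4, 1, 2, 3, 4])
```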
Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D23828003 Pulled By: eellison fbshipit-source-id: 9109fcb6f2ca20ca897cae71683530285da9d537 --- test/cpp/jit/test_alias_analysis.cpp | 26 ++++++++++ test/jit/test_remove_mutation.py | 41 ++++++++++++++++ torch/csrc/jit/ir/alias_analysis.cpp | 72 ++++++++++++++++++++++++---- torch/csrc/jit/ir/alias_analysis.h | 5 +- 4 files changed, 133 insertions(+), 11 deletions(-) diff --git a/test/cpp/jit/test_alias_analysis.cpp b/test/cpp/jit/test_alias_analysis.cpp index e854113a7a87..e700ee540616 100644 --- a/test/cpp/jit/test_alias_analysis.cpp +++ b/test/cpp/jit/test_alias_analysis.cpp @@ -1238,6 +1238,32 @@ TEST(AliasRegistrationTest, PureWithAnnotationsShouldError) { "Tried to register operator foo::rand11(Tensor(a) arg1) -> (Tensor(a)) with aliasing information in the schema but without AliasAnalysisKind::FROM_SCHEMA"); } +TEST(AliasRegistrationTest, AliasMoveAtenListOp) { + auto graph = std::make_shared(); + std::unordered_map vmap; + auto graph_string = R"IR( + graph(): + %x : Tensor = prim::MakeTestTensor() + %8 : int = prim::Constant[value=0]() + %5 : int = prim::Constant[value=1]() + %4 : int = prim::Constant[value=2]() + %y : Tensor[] = prim::ListConstruct(%x) + %6 : Tensor = aten::add_(%x, %4, %5) + %9 : Tensor = aten::cat(%y, %8) + return (%9))IR"; + + torch::jit::parseIR(graph_string, graph.get(), vmap); + AliasDb aliasDb(graph); + + // bc y.1 has a single used in a single non-aliasing aten op, + // x is added to y.1 contained elements instead of wildcard set + EXPECT_TRUE(!aliasDb.mayAlias(vmap["x"], vmap["9"])); + + // write to contained element should prevent move + EXPECT_TRUE(!aliasDb.moveBeforeTopologicallyValid( + vmap["y"]->node(), vmap["9"]->node())); +} + TEST(AliasRegistrationTest, PureWithAnnotationsShouldError2) { auto registry = torch::RegisterOperators().op( "foo::rand12(Tensor(a) arg1) -> Tensor(b)", diff --git a/test/jit/test_remove_mutation.py b/test/jit/test_remove_mutation.py index ef408e775c33..b747fc06bcde 100644 --- a/test/jit/test_remove_mutation.py +++ b/test/jit/test_remove_mutation.py @@ -200,3 +200,44 @@ def intermediary_use(): # it is possible to remove the append here but don't currently have the logic for it FileCheck().check_not("append").run(graph) self.assertEqual(intermediary_use(), fn()) + + def test_common_pytorch_list_ops(self): + for op in ["cat", "stack", "vstack", "hstack", "dstack"]: + class OpMod(torch.nn.Module): + def __init__(self, op): + super(OpMod, self).__init__() + self.op = torch_op + + def forward(self): + x = torch.tensor([1, 2, 3, 4]) + x.add_(3) + y = [x, x] + return self.op(y) + 3 + + torch_op = getattr(torch, op) + mod = OpMod(torch_op) + mod_script = torch.jit.script(mod) + self.run_pass('remove_mutation', mod_script.forward.graph) + FileCheck().check_not("aten::add_").run(mod_script.forward.graph) + self.assertEqual(mod(), mod_script()) + + # test that the output doesnt alias the input + for inputs in [torch.rand(2, 2)], [torch.rand(2, 2) for _ in range(2)]: + result = torch_op(inputs) + sums = [ten.sum() for ten in result] + + for inp in inputs: + inp.fill_(10) + + self.assertEqual(sums, [ten.sum() for ten in result]) + + + @torch.jit.script + def test_multiple_uses(): + x = torch.tensor([1, 2, 3, 4]) + x.add_(3) + y = [x, x] + return torch.cat(y), y + + self.run_pass('remove_mutation', mod_script.forward.graph) + FileCheck().check("aten::add_").run(test_multiple_uses.graph) diff --git a/torch/csrc/jit/ir/alias_analysis.cpp 
b/torch/csrc/jit/ir/alias_analysis.cpp index 50b84d8f6405..bb5872f35f4f 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -298,15 +299,10 @@ void AliasDb::getReadsImpl(Node* n, MemoryLocations& ret) const { auto it = elementMap_.find(input); if (it != elementMap_.end()) { auto el = it->second; - // Add all memory locations this element may alias. - ret |= memoryDAG_->getMemoryLocations(el); - // We also consider memory locations of contained values to be "read". - for (const auto& type : input->type()->containedTypes()) { - if (auto wildcard = getWildcard(type)) { - ret |= memoryDAG_->getMemoryLocations(wildcard); - } - } + // Add all memory locations this element may alias and their contained + // elements + memoryDAG_->collectAllContainedMemoryLocations(el, ret); } } @@ -878,6 +874,44 @@ void AliasDb::analyzeConservative(Node* node) { } } +bool AliasDb::functionalNonEscapingListUse(const Use& use) const { + Node* n = use.user; + size_t offset = use.offset; + Value* container = n->inputs().at(offset); + + // only consider aten op uses of lists + if (!container->type()->cast()) { + return false; + } + + /* + in the general case, we consider any Value that enters another container as + entering the heap, and thus aliasing all other heap values of the same type. + the advantage of this approach are: + - there are many composite list/container ops that would be tricky to + schematize if we did something more complicated + - limits the size of the AliasDb, because a container of size 10 only contains + 1 memory dag element instead of 10 + - we do not need to worry about adding contained elements to the wildcard set + when a container escapes the graph. + The downside of this approach is we are unable to handle the common case of a + list constructed and passed into an aten op. Here, optimize for a set of + common ops where the output does not alias the list or the list elements + */ + + switch (use.user->kind()) { + case aten::cat: + case aten::broadcast_tensors: + case aten::stack: + case aten::vstack: + case aten::hstack: + case aten::dstack: + return true; + } + + return false; +} + // List or dict or tuple: construct: create an aliasing element for the actual // container, then mark all inputs as wildcards, since they've gone inside the // container. Then, add the wildcard sets of appropriate type to the contained @@ -895,6 +929,20 @@ void AliasDb::analyzeContainerConstruct(Node* node) { TORCH_INTERNAL_ASSERT(node->outputs().size() == 1); auto container = node->output(); + + // optimization: + // if a list is only used once in an aten op, and the op output + // doesn't alias the input, then we can add all inputs to the list's + // contained elements instead of the wildcard set. 
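On the guard implemented just below: the relaxed handling only applies when the list has exactly one use and that use is one of a small set of ops (`cat`, `broadcast_tensors`, `stack`, `vstack`, `hstack`, `dstack`) whose output does not alias the input list. Any additional use of the list keeps the conservative behavior, mirroring the `test_multiple_uses` case added above. A hedged sketch of the two situations:

```
import torch

@torch.jit.script
def single_use(x_in: torch.Tensor):
    x = x_in + 0
    y = [x, x]
    return torch.cat(y)      # only use of `y`: eligible for the relaxed analysis

@torch.jit.script
def multiple_uses(x_in: torch.Tensor):
    x = x_in + 0
    y = [x, x]
    return torch.cat(y), y   # `y` is also returned: analyzed conservatively
```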
+ if (container->uses().size() == 1 && + functionalNonEscapingListUse(container->uses().at(0))) { + giveFreshAlias(container, false); + for (Value* v : node->inputs()) { + addToContainedElements(v, container); + } + return; + } + giveFreshAlias(container); auto container_elem = elementMap_.at(container); for (auto input : node->inputs()) { @@ -1068,7 +1116,9 @@ void AliasDb::createValue(const Value* value) { elementMap_[value] = new_elem; } -void AliasDb::giveFreshAlias(const Value* value) { +void AliasDb::giveFreshAlias( + const Value* value, + bool add_wildcard_to_contained_elems) { auto maybe_mut_type = getMutableTypePtr(value->type()); if (!maybe_mut_type) { return; @@ -1082,7 +1132,9 @@ void AliasDb::giveFreshAlias(const Value* value) { auto new_elem = memoryDAGBuilder_->makeFreshValue(value); elementMap_[value] = new_elem; - addContainedTypesToFreshElement(new_elem, *maybe_mut_type); + if (add_wildcard_to_contained_elems) { + addContainedTypesToFreshElement(new_elem, *maybe_mut_type); + } } Element* AliasDb::getOrCreateElement(const Value* value) { diff --git a/torch/csrc/jit/ir/alias_analysis.h b/torch/csrc/jit/ir/alias_analysis.h index e3e69185891f..b20654b1f6b9 100644 --- a/torch/csrc/jit/ir/alias_analysis.h +++ b/torch/csrc/jit/ir/alias_analysis.h @@ -205,10 +205,13 @@ class AliasDb { const Value* element, const Value* container); void mapAliases(at::ArrayRef to, at::ArrayRef from); - void giveFreshAlias(const Value* value); + void giveFreshAlias( + const Value* value, + bool add_wildcard_to_contained_elems = true); Element* getOrCreateElement(const Value* value); c10::optional getMutableTypePtr(const TypePtr& type) const; + bool functionalNonEscapingListUse(const Use& use) const; bool isContainerType(const TypePtr& type) const; From 4b42f0b6134977bf3ae1b3466ed4674ada9fe372 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Tue, 22 Sep 2020 09:57:24 -0700 Subject: [PATCH 013/449] Support Math keyword in native_functions.yaml. (#44556) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44556 Test Plan: Imported from OSS Reviewed By: bhosmer Differential Revision: D23698386 Pulled By: ailzhang fbshipit-source-id: f10ea839a2cfe7d16f5823a75b8b8c5f1ae22dde --- aten/src/ATen/native/README.md | 12 +++++++ aten/src/ATen/native/group_norm.cpp | 24 +++++++++++++ aten/src/ATen/native/native_functions.yaml | 1 + aten/src/ATen/templates/TypeDefault.cpp | 4 +++ aten/src/ATen/test/CMakeLists.txt | 1 + aten/src/ATen/test/math_kernel_test.cpp | 40 ++++++++++++++++++++++ tools/autograd/gen_variable_type.py | 7 ++-- tools/codegen/gen.py | 21 ++++++++++-- 8 files changed, 104 insertions(+), 6 deletions(-) create mode 100644 aten/src/ATen/test/math_kernel_test.cpp diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 861901521a3b..f18114e73246 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -277,6 +277,18 @@ them the same thing!) If two backends have the same dispatch function, you can write `CPU, CUDA: func` to reuse the same function name in both cases. +Available backend options can be found at +https://github.com/pytorch/pytorch/blob/master/tools/codegen/gen.py#L970. +In addition to backends above, we also support keyword `Math` which is an alias +that maps to all backend and autograd backend keys. In other words, function registered to `Math` key +should be a plain mathematical composition of other `at::` functions and works for any backend. 
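To make "plain mathematical composition" concrete: the `math_group_norm` kernel added later in this patch expresses group norm through batch norm plus reshapes. A rough Python rendering of that composition, for intuition only; the names and exact reshapes here are illustrative and not the C++ code.

```
import torch
import torch.nn.functional as F

def group_norm_by_composition(x, num_groups, weight=None, bias=None, eps=1e-5):
    N, C = x.shape[0], x.shape[1]
    orig_shape = x.shape
    # Normalize each (sample, group) slice by reusing batch_norm in training
    # mode with no running statistics, the same trick math_group_norm uses.
    out = x.reshape(1, N * num_groups, -1)
    out = F.batch_norm(out, None, None, training=True, momentum=0.0, eps=eps)
    out = out.reshape(orig_shape)
    affine_shape = [1, C] + [1] * (x.dim() - 2)
    if weight is not None:
        out = out * weight.reshape(affine_shape)
    if bias is not None:
        out = out + bias.reshape(affine_shape)
    return out

# e.g. for t = torch.randn(2, 6, 4, 4), the result should match
# F.group_norm(t, 3) up to numerical tolerance.
```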
+ +If you add `dispatch` section to any API that didn't have it before, you **have to** move +the old implementation to `Math` field so that it's still available for other backends to use. + +This work is currently WIP and you can find the design proposal in +https://github.com/pytorch/pytorch/issues/44680. + ### `device_guard` ``` diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 229e54a9ce62..beb4d940363e 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -133,5 +133,29 @@ Tensor group_norm( DEFINE_DISPATCH(GroupNormKernel); DEFINE_DISPATCH(GroupNormBackwardKernel); +std::tuple math_group_norm( + const at::Tensor& input, const at::Tensor& weight, + const at::Tensor& bias, int64_t N, int64_t C, int64_t HxW, + int64_t group, double eps) { + auto input_shape = input.sizes(); + at::Tensor input_reshaped = input.view({1, N * group, N ? -1 : 1}); + auto outputs = at::native_batch_norm( + input_reshaped, /*weight=*/{}, /*bias=*/{}, /*running_mean=*/{}, + /*running_var=*/{}, /*training=*/true, /*momentum=*/0, eps); + at::Tensor out = std::get<0>(outputs); + out = out.view(input_shape); + std::vector affine_param_shape(input.dim(), 1); + affine_param_shape[1] = C; + if (weight.defined() && bias.defined()) { + out = bias.view(affine_param_shape).addcmul(out, weight.view(affine_param_shape), 1); + } else if (weight.defined()) { + out = out.mul(weight.view(affine_param_shape)); + } else if (bias.defined()) { + out = out.add(bias.view(affine_param_shape)); + } + at::Tensor mean = std::get<1>(outputs).view({N, group}); + at::Tensor rstd = std::get<2>(outputs).view({N, group}); + return std::make_tuple(out, mean, rstd); +} } // namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 3244522f1808..d5a746e2a522 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1731,6 +1731,7 @@ use_c10_dispatcher: full dispatch: CPU, CUDA: native_group_norm + Math: math_group_norm - func: native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? 
weight, int N, int C, int HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index aa5bb4f0c838..6f2b988619c7 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -73,4 +73,8 @@ TORCH_LIBRARY(aten, m) { m.def("get_gradients(int context_id) -> Dict(Tensor, Tensor)"); } +TORCH_LIBRARY_IMPL(aten, Math, m) { + ${math_function_registrations}; +} + } // namespace at diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index a0b992302084..43d0fc8ccd92 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -27,6 +27,7 @@ list(APPEND ATen_CPU_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/extension_backend_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/xla_tensor_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tensor_iterator_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/math_kernel_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/memory_overlapping_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpu_generator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/pow_test.cpp diff --git a/aten/src/ATen/test/math_kernel_test.cpp b/aten/src/ATen/test/math_kernel_test.cpp new file mode 100644 index 000000000000..9a4dfd640c3e --- /dev/null +++ b/aten/src/ATen/test/math_kernel_test.cpp @@ -0,0 +1,40 @@ +#include + +#include + +using namespace at; + +#define ASSERT_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ + ASSERT_TRUE(t1.is_same_size(t2)); \ + ASSERT_TRUE(t1.allclose(t2, atol, rtol)); + +// Ideally we want to test both forward and backward on math kernels but I +// haven't found an easy way to do it. Currently we only test forward here +// and rely on backward tests of each at:: function used in math kernels. +TEST(MathKernelTest, NativeGroupNorm) { + int num_channels = 6; + int N = 2; + int H = 2, W = 2; + int HxW = H * W; + + const auto input = randn({N, num_channels, H, W}); + const auto weight = randn({num_channels}); + const auto bias = randn({num_channels}); + double eps = 1e-05; + for (bool undef_weight: {true, false}) { + for (int num_groups: {3, 6, 1}) { + Tensor undef; + auto out = at::native::native_group_norm( + input, undef_weight ? undef : weight, undef_weight ? undef : bias, + N, num_channels, HxW, num_groups, eps); + auto math_out = at::native::math_group_norm( + input, undef_weight ? undef : weight, undef_weight ? undef : bias, + N, num_channels, HxW, num_groups, eps); + ASSERT_ALLCLOSE_TOLERANCES(std::get<0>(out), std::get<0>(math_out), 1e-4, 1e-6); + ASSERT_ALLCLOSE_TOLERANCES(std::get<1>(out), std::get<1>(math_out), 1e-4, 1e-6); + ASSERT_ALLCLOSE_TOLERANCES(std::get<2>(out), std::get<2>(math_out), 1e-4, 1e-6); + } + } +} + + diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 804da9193a50..e41c921f1e33 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -370,7 +370,8 @@ # Generate a file that lists all functions and their schema string. 
Used for XLA REGISTRATION_DECLARATION = CodeTemplate("""\ -${return_type} ${api_name}(${declaration_formals}); // {"schema": "${schema_string}", "compound": "${compound}"} +${return_type} ${api_name}(${declaration_formals}); \ +// {"schema": "${schema_string}", "compound": "${compound}", "has_math_kernel": "${has_math_kernel}"} """) # TraceType templates @@ -654,12 +655,12 @@ def gen_variable_type(out, aten_declarations, template_path): registration_declarations.append( REGISTRATION_DECLARATION.substitute(declaration, declaration_formals=declaration_formals, - compound='false')) + compound='False')) else: registration_declarations.append( REGISTRATION_DECLARATION.substitute(declaration, declaration_formals=declaration_formals, - compound='true')) + compound='True')) env = { 'registration_declarations': registration_declarations, diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index e4acb369f08e..be8c57f1061a 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -290,7 +290,7 @@ def func(f: NativeFunction) -> Optional[str]: assert returns_type == dispatcher.returns_type(f.func.returns) dispatcher_args = dispatcher.arguments(f.func) dispatcher_args_types_str = ', '.join(map(lambda a: a.type, dispatcher_args)) - if dispatch is None: + if dispatch is None or dispatch == 'Math': type_name = f'TypeDefault::{name}' else: type_name = f'{dispatch}Type::{name}' @@ -811,6 +811,7 @@ def compute_declaration_yaml(f: NativeFunction) -> object: ('device_guard', f.device_guard), ('with_gil', False), ('deprecated', False), + ('has_math_kernel', f.dispatch is not None and 'Math' in f.dispatch), ]) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # @@ -1016,17 +1017,31 @@ def make_file_manager(install_dir: str) -> FileManager: del fm cpu_fm.write('TypeDefault.h', lambda: { - 'type_method_declarations': list(mapMaybe( + 'type_method_declarations': + list(mapMaybe( compute_type_method(None, target=Target.DECLARATION, op_registration_whitelist=op_registration_whitelist), + native_functions)) + + list(mapMaybe( + compute_type_method('Math', target=Target.DECLARATION, op_registration_whitelist=op_registration_whitelist), native_functions)), + }) cpu_fm.write('TypeDefault.cpp', lambda: { - 'type_method_definitions': list(mapMaybe( + 'type_method_definitions': + list(mapMaybe( compute_type_method(None, target=Target.DEFINITION, op_registration_whitelist=op_registration_whitelist), + native_functions)) + + list(mapMaybe( + compute_type_method('Math', target=Target.DEFINITION, op_registration_whitelist=op_registration_whitelist), native_functions)), + 'function_registrations': list(mapMaybe( compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), native_functions)) if not options.per_op_registration else [], + + 'math_function_registrations': list(mapMaybe( + compute_type_method('Math', target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), + native_functions)) if not options.per_op_registration else [], }) cpu_fm.write('Functions.h', lambda: { 'function_declarations': list(mapMaybe(compute_function(target=Target.DECLARATION), native_functions)), From 8501b89a87398422025df32f50c0d3e1bbd152fb Mon Sep 17 00:00:00 2001 From: Negin Raoof Date: Tue, 22 Sep 2020 10:07:08 -0700 Subject: [PATCH 014/449] [ONNX] Update ort release (#45095) Summary: Update ort release Pull Request resolved: https://github.com/pytorch/pytorch/pull/45095 Reviewed By: bwasti Differential Revision: D23832041 Pulled By: malfet 
fbshipit-source-id: 39c47a87e451c4c43ba4d4e8be385cc195cc611a --- .jenkins/caffe2/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 20a7310a91c1..61fb7de08fe5 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -171,7 +171,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # default pip version is too old(9.0.2), unable to support tag `manylinux2010`. # Fix the pip error: Couldn't find a version that satisfies the requirement pip install --upgrade pip - pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.4.0.dev202008122 + pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.5.0.dev202009182 fi "$ROOT_DIR/scripts/onnx/test.sh" fi From 1fd48a9d1f4158fb015793caa282d2c0ca92059d Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 22 Sep 2020 10:28:26 -0700 Subject: [PATCH 015/449] Revert D23798016: [FX] s/get_param/get_attr/ Test Plan: revert-hammer Differential Revision: D23798016 (https://github.com/pytorch/pytorch/commit/c941dd3492535b3e09f4cb3f60c80b02f5e04c3f) Original commit changeset: 1d2f3db1994a fbshipit-source-id: 974d930064b37d396c5d66c905a63d45449813e5 --- test/fx/quantization.py | 2 +- test/test_fx.py | 6 +++--- torch/fx/__init__.py | 4 ++-- torch/fx/graph.py | 10 +++++----- torch/fx/graph_module.py | 4 ++-- torch/fx/symbolic_trace.py | 8 ++++---- torch/quantization/fx/quantize.py | 4 ++-- torch/quantization/fx/utils.py | 4 ++-- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/test/fx/quantization.py b/test/fx/quantization.py index 968c797c9163..8116ed5ce89a 100644 --- a/test/fx/quantization.py +++ b/test/fx/quantization.py @@ -222,7 +222,7 @@ def load_arg(a): for node in self.graph.nodes: if node.op == 'placeholder': result = next(args_iter) - elif node.op == 'get_attr': + elif node.op == 'get_param': result = self.state_dict[node.target] elif node.op == 'call_function': result = node.target(*load_arg(node.args), **load_arg(node.kwargs)) diff --git a/test/test_fx.py b/test/test_fx.py index 41607d64cbcc..89311e2a2873 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -336,7 +336,7 @@ def __init__(self, interpreter): placeholder_nodes.append(graph.create_node('placeholder', name)) # Get the interpreter object - interpreter_node = graph.create_node('get_attr', 'interpreter') + interpreter_node = graph.create_node('get_param', 'interpreter') # Add a node to call the interpreter instance output_node = graph.create_node( @@ -567,7 +567,7 @@ def test_graph_fns(self): g = Graph() a = g.placeholder('a') b = g.call_module('linear', (a,)) - c = g.get_attr('bias') + c = g.get_param('bias') d = g.call_method('add', (b, c)) e = g.call_function(torch.sin, (d,)) g.output(e) @@ -584,7 +584,7 @@ def test_construct_root_dict(self): graph : torch.fx.Graph = torch.fx.Graph() a : torch.fx.Node = graph.create_node('placeholder', 'x') b : torch.fx.Node = graph.create_node('call_module', 'foo.bar.baz', args=(a,)) - c : torch.fx.Node = graph.create_node('get_attr', 'zip.zap.zam') + c : torch.fx.Node = graph.create_node('get_param', 'zip.zap.zam') d : torch.fx.Node = graph.create_node('call_function', operator.add, args=(b, c)) graph.output(d) diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py index 185511460740..5b90c434340c 100644 --- a/torch/fx/__init__.py +++ b/torch/fx/__init__.py @@ -36,7 +36,7 @@ def forward(self, x): opcode name target args kwargs ------------- ------------- ------------------------------------------------------- 
------------------ ----------- placeholder x x () {} -get_attr linear_weight linear.weight () {} +get_param linear_weight linear.weight () {} call_function add_1 (x, linear_weight) {} call_module linear_1 linear (add_1,) {} call_method relu_2 relu [linear_1] {} @@ -48,7 +48,7 @@ def forward(self, x): - `placeholder` represents a function input. The `name` attribute specifies the name this value will take on. `target` is similarly the name of the argument. `args` and `kwargs` are don't-care -- `get_attr` retrieves a parameter from the module hierarchy. `name` is similarly the name the result of the +- `get_param` retrieves a parameter from the module hierarchy. `name` is similarly the name the result of the fetch is assigned to. `target` is the fully-qualified name of the parameter's position in the module hierarchy. `args` and `kwargs` are don't-care - `call_function` applies a free function to some values. `name` is similarly the name of the value to assign diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 1a8079ca8289..a63b7c8b35dc 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -80,7 +80,7 @@ def create_node(self, op: str, target: Target, args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, name: Optional[str] = None) -> Node: - assert op in ('call_function', 'call_method', 'get_attr', 'call_module', 'placeholder') + assert op in ('call_function', 'call_method', 'get_param', 'call_module', 'placeholder') args = () if args is None else args kwargs = {} if kwargs is None else kwargs self._mark_uses(args) @@ -93,8 +93,8 @@ def create_node(self, op: str, target: Target, def placeholder(self, name: str) -> Node: return self.create_node('placeholder', name) - def get_attr(self, name: str) -> Node: - return self.create_node('get_attr', name) + def get_param(self, name: str) -> Node: + return self.create_node('get_param', name) def call_module(self, module_name: str, @@ -196,7 +196,7 @@ def python_code(self, root_module: str) -> Tuple[str, str, List[str]]: assert isinstance(node.target, str) body.append(f'{node.name} = {_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})\n') continue - elif node.op == 'get_attr': + elif node.op == 'get_param': assert isinstance(node.target, str) body.append(f'{node.name} = {_format_target(root_module, node.target)}\n') continue @@ -230,7 +230,7 @@ def format_node(n : Node) -> Optional[str]: assert isinstance(n.target, str) placeholder_names.append(n.target) return None - elif n.op == 'get_attr': + elif n.op == 'get_param': return f'%{n.name} : [uses={n.uses}] = self.{n.target}' else: return f'%{n.name} : [uses={n.uses}] = {n.op}[target={n.target}](' \ diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index 505ee991d6cc..df40cbd84fe1 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -124,13 +124,13 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph): if hasattr(root, 'training'): self.training = root.training for node in graph.nodes: - if node.op in ['get_attr', 'call_module']: + if node.op in ['get_param', 'call_module']: assert isinstance(node.target, str) _copy_attr(root, self, node.target) elif isinstance(root, dict): targets_to_copy = [] for node in graph.nodes: - if node.op in ['get_attr', 'call_module']: + if node.op in ['get_param', 'call_module']: assert isinstance(node.target, str) if node.target not in root: raise RuntimeError('Node ' + str(node) + ' referenced target ' + node.target + diff --git 
a/torch/fx/symbolic_trace.py b/torch/fx/symbolic_trace.py index 9b192dd5501f..442fa28c36d9 100644 --- a/torch/fx/symbolic_trace.py +++ b/torch/fx/symbolic_trace.py @@ -55,15 +55,15 @@ def create_arg(self, a: Any) -> Argument: if isinstance(a, torch.nn.Parameter): for n, p in self.root.named_parameters(): if a is p: - return self.create_node('get_attr', n, (), {}) + return self.create_node('get_param', n, (), {}) raise NameError('parameter is not a member of this module') # Tensors do not have a reliable string repr() from which they can be # constructed (and we probably don't want to rely on that, either), so # for any constant Tensor values we encounter, first search for if they # are an attribute of some module in the module hierarchy. If so, emit - # a get_attr to retrieve that tensor. Otherwise, we'll store away the + # a get_param to retrieve that tensor. Otherwise, we'll store away the # tensor value into a special attribute on the Module s.t. we can - # retrieve it with a get_attr. + # retrieve it with a get_param. if isinstance(a, torch.Tensor): # TODO: slow def search_for_tensor(m : torch.nn.Module) -> Optional[List[str]]: @@ -96,7 +96,7 @@ def search_for_tensor(m : torch.nn.Module) -> Optional[List[str]]: i += 1 setattr(self.root, qualname, a) - return self.create_node('get_attr', qualname, (), {}) + return self.create_node('get_param', qualname, (), {}) return super().create_arg(a) def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool: diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 7967b4ec2dcb..8d8ef0f328c3 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -177,7 +177,7 @@ def get_qconfig(module): self.qconfig_map = dict() for node in input_graph.nodes: - if node.op == 'get_attr': + if node.op == 'get_param': parent, _ = _parent_name(node.target) self.qconfig_map[node.name] = get_qconfig(self.modules[parent]) elif node.op == 'call_function': @@ -557,7 +557,7 @@ def load_arg(a): setattr(quantized_root, packed_weight_name, packed_weight) # replace prepack node with a getattr node env[node.name] = folded_graph.create_node( - 'get_attr', packed_weight_name, (), {}) + 'get_param', packed_weight_name, (), {}) elif prepack_node is not None: # remove the foled node continue diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 5d5532dc48fc..95d19df1e1b4 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -17,7 +17,7 @@ def graph_pretty_str(g, shorten=True) -> str: built_in_meth_re = re.compile('') op_dict = { 'placeholder': 'plchdr', - 'get_attr': 'gt_prm', + 'get_param': 'gt_prm', 'call_function': 'cl_fun', 'call_module': 'cl_mod', 'call_method': 'cl_meth', @@ -136,5 +136,5 @@ def get_next_qparams_idx(module, qparams): for key, value in qparams.items(): setattr(root_module, key + str(idx), value) qparam_full_path = key + str(idx) - inputs.append(graph.create_node('get_attr', qparam_full_path)) + inputs.append(graph.create_node('get_param', qparam_full_path)) return graph.create_node('call_function', quantize_op, tuple(inputs), {}) From 10f287539f64431a41c3571b40b966c0a8e85e65 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Tue, 22 Sep 2020 10:48:06 -0700 Subject: [PATCH 016/449] Align casing in test_dispatch with dispatch keys. 
(#44933) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44933 Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D23778247 Pulled By: ailzhang fbshipit-source-id: bc3725eae670b03543015afe763cb3bb16baf8f6 --- test/test_dispatch.py | 89 +++++++++++++++++++--------- torch/csrc/utils/python_dispatch.cpp | 13 ++-- 2 files changed, 68 insertions(+), 34 deletions(-) diff --git a/test/test_dispatch.py b/test/test_dispatch.py index ec9fd20797e3..45480d8916f0 100644 --- a/test/test_dispatch.py +++ b/test/test_dispatch.py @@ -229,11 +229,11 @@ def test_def(self): # m.impl("test_def", [](const Tensor& x) { return x }) lambda m: m.impl_t_t("foo"), # m.impl("test_def", kCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", dispatch="cpu"), + lambda m: m.impl_t_t("foo", dispatch="CPU"), # m.impl("test_def", kAutograd, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", dispatch="autograd"), + lambda m: m.impl_t_t("foo", dispatch="Autograd"), # m.impl("test_def", kAutogradCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", dispatch="autogradcpu") + lambda m: m.impl_t_t("foo", dispatch="AutogradCPU") ]).state self.assertExpectedInline(state, '''\ name: test::foo @@ -262,11 +262,11 @@ def test_def_with_inference(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu"), + lambda m: m.impl_t_t("foo", "CPU"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd"), + lambda m: m.impl_t_t("foo", "Autograd"), # m.impl("foo", torch::kAutogradCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autogradcpu") + lambda m: m.impl_t_t("foo", "AutogradCPU") ]).state self.assertExpectedInline(state, '''\ name: test::foo @@ -296,11 +296,11 @@ def test_impl_only(self): # m.impl("foo", [](const Tensor& x) { return x }) lambda m: m.impl_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", "cpu"), + lambda m: m.impl_t_t("foo", "CPU"), # m.impl("foo", torch::kAutograd, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", "autograd"), + lambda m: m.impl_t_t("foo", "Autograd"), # m.impl("foo", torch::kAutogradCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", "autogradcpu") + lambda m: m.impl_t_t("foo", "AutogradCPU") ]).state self.assertExpectedInline(state, '''\ name: test::foo @@ -316,13 +316,13 @@ def test_computed_table(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kCUDA, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "xla", debug="fn_xla"), + lambda m: m.impl_t_t("foo", "XLA", debug="fn_xla"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd", debug="fn_autograd"), + lambda m: m.impl_t_t("foo", "Autograd", debug="fn_autograd"), # m.impl("foo", torch::kAutogradCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autogradcpu", debug="fn_autogradcpu") + lambda m: m.impl_t_t("foo", "AutogradCPU", debug="fn_autogradcpu") ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -351,12 +351,12 @@ def 
test_computed_table(self): ''') def test_computed_table_with_cpu_catchall(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu"), + lambda m: m.impl_t_t("foo", "CPU"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -382,12 +382,12 @@ def test_computed_table_with_cpu_catchall(self): ''') def test_computed_table_with_math(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo(Tensor x) -> Tensor") lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "math"), + lambda m: m.impl_t_t("foo", "Math"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -412,14 +412,14 @@ def test_computed_table_with_math(self): ''') def test_computed_table_with_cpu_math(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo(Tensor x) -> Tensor") lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "math", debug="fn_math"), + lambda m: m.impl_t_t("foo", "Math", debug="fn_math"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -445,12 +445,12 @@ def test_computed_table_with_cpu_math(self): ''') def test_computed_table_with_autograd(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo(Tensor x) -> Tensor") lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd"), + lambda m: m.impl_t_t("foo", "Autograd"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -476,11 +476,11 @@ def test_computed_table_with_cpu_autograd_math_catchall(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd", debug="fn_autograd"), + lambda m: m.impl_t_t("foo", "Autograd", debug="fn_autograd"), # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "math", debug="fn_math"), + lambda m: m.impl_t_t("foo", "Math", debug="fn_math"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -512,9 +512,9 @@ def test_computed_table_with_cpu_autograd_catchall(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", 
debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd", debug="fn_autograd"), + lambda m: m.impl_t_t("foo", "Autograd", debug="fn_autograd"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -538,6 +538,39 @@ def test_computed_table_with_cpu_autograd_catchall(self): AutogradCPU: fn_autograd [autograd kernel] AutogradCUDA: fn_autograd [autograd kernel] AutogradXLA: fn_autograd [autograd kernel] +''') + + def test_computed_table_with_ambiguous_autogradother(self): + result = self.commute("foo", [ + # m.def("foo", [](const Tensor & x) { return x }) + lambda m: m.def_name_t_t("foo"), + # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) + lambda m: m.impl_t_t("foo", "Math", debug="fn_math"), + # m.impl("foo", torch::kQuantizedCPU, [](const Tensor & x) { return x }) + lambda m: m.impl_t_t("foo", "QuantizedCPU", debug="fn_quantizedcpu"), + ]) + state, table = result.state, result.table + self.assertExpectedInline(state, '''\ +name: test::foo +schema: test::foo(Tensor _0) -> (Tensor _0) +debug: registered at /dev/null:0 +alias analysis kind: CONSERVATIVE +QuantizedCPU: fn_quantizedcpu :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +Math[alias]: fn_math :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +catchall: default_def_name_t_t :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +''') + + # computed dispatch table is too big, so we only check on a few entries we're interested in. + extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check) + + self.assertExpectedInline(extracted_table, '''\ +CPU: fn_math [math kernel] +CUDA: fn_math [math kernel] +XLA: fn_math [math kernel] +AutogradOther: ambiguous_autogradother [ambiguous autogradother] +AutogradCPU: fn_math [math kernel] +AutogradCUDA: fn_math [math kernel] +AutogradXLA: fn_math [math kernel] ''') # Can't do this yet for BC reasons @@ -631,7 +664,7 @@ def test_multiple_def_alias_mismatch(self): ) def test_multiple_fallback(self): - global_m = C._dispatch_library("IMPL", "_", "xla") + global_m = C._dispatch_library("IMPL", "_", "XLA") global_m.fallback_fallthrough(), try: global_m.fallback_fallthrough(), diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp index 21bf8e69adc4..f0f63bf7a2f0 100644 --- a/torch/csrc/utils/python_dispatch.cpp +++ b/torch/csrc/utils/python_dispatch.cpp @@ -27,12 +27,13 @@ torch::Library::Kind parseKind(const std::string& k) { c10::optional parseDispatchKey(const std::string& k) { static std::unordered_map key_map = { - {"cpu", c10::DispatchKey::CPU}, - {"cuda", c10::DispatchKey::CUDA}, - {"xla", c10::DispatchKey::XLA}, - {"math", c10::DispatchKey::Math}, - {"autograd", c10::DispatchKey::Autograd}, - {"autogradcpu", c10::DispatchKey::AutogradCPU}, + {"CPU", c10::DispatchKey::CPU}, + {"CUDA", c10::DispatchKey::CUDA}, + {"XLA", c10::DispatchKey::XLA}, + {"QuantizedCPU", c10::DispatchKey::QuantizedCPU}, + {"Math", c10::DispatchKey::Math}, + {"Autograd", c10::DispatchKey::Autograd}, + {"AutogradCPU", c10::DispatchKey::AutogradCPU}, {"", c10::DispatchKey::Undefined}, }; auto it = key_map.find(k); From ef885c10d8591a924bd889d0d8778f485dc10f42 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Tue, 22 Sep 2020 11:33:45 -0700 Subject: [PATCH 017/449] [pytorch] Add triplet margin loss with custom distance (#43680) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43680 As 
discussed [here](https://github.com/pytorch/pytorch/issues/43342), adding in a Python-only implementation of the triplet-margin loss that takes a custom distance function. Still discussing whether this is necessary to add to PyTorch Core. Test Plan: python test/run_tests.py Imported from OSS Reviewed By: albanD Differential Revision: D23363898 fbshipit-source-id: 1cafc05abecdbe7812b41deaa1e50ea11239d0cb --- docs/source/nn.functional.rst | 7 +- docs/source/nn.rst | 3 +- test/test_nn.py | 80 ++++++++++++++++++++++ torch/nn/functional.py | 36 ++++++++++ torch/nn/functional.pyi.in | 11 ++- torch/nn/modules/__init__.py | 6 +- torch/nn/modules/loss.py | 123 +++++++++++++++++++++++++++++++++- torch/overrides.py | 3 + 8 files changed, 258 insertions(+), 11 deletions(-) diff --git a/docs/source/nn.functional.rst b/docs/source/nn.functional.rst index eb88b50e6d56..416121cec8d6 100644 --- a/docs/source/nn.functional.rst +++ b/docs/source/nn.functional.rst @@ -483,6 +483,11 @@ Loss functions .. autofunction:: triplet_margin_loss +:hidden:`triplet_margin_with_distance_loss` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: triplet_margin_with_distance_loss + Vision functions ---------------- @@ -533,5 +538,3 @@ DataParallel functions (multi-GPU, distributed) ~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: torch.nn.parallel.data_parallel - - diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 3a6cb7e19316..8d195c04037c 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -10,7 +10,7 @@ These are the basic building block for graphs :depth: 2 :local: :backlinks: top - + .. currentmodule:: torch.nn @@ -269,6 +269,7 @@ Loss Functions nn.CosineEmbeddingLoss nn.MultiMarginLoss nn.TripletMarginLoss + nn.TripletMarginWithDistanceLoss Vision Layers ---------------- diff --git a/test/test_nn.py b/test/test_nn.py index 2dde3c46b74e..00614c0cdc34 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -9866,6 +9866,7 @@ def v(fn): v(lambda: F.multilabel_margin_loss(input, zeros, reduction=reduction)) v(lambda: F.triplet_margin_loss(input, input, input, reduction=reduction)) + v(lambda: F.triplet_margin_with_distance_loss(input, input, input, reduction=reduction)) v(lambda: F.margin_ranking_loss(input, input, input.sign(), reduction=reduction)) v(lambda: F.cosine_embedding_loss(input, input, input[:, 0].sign(), reduction=reduction)) @@ -12185,6 +12186,85 @@ def test_threshold_inplace_overlap(self, device): F.threshold(x, 0.5, 0.5, inplace=True) F.threshold_(x, 0.5, 0.5) + @onlyOnCPUAndCUDA + def test_triplet_margin_with_distance_loss_default_parity(self, device): + # Test for `nn.TripletMarginWithDistanceLoss` and + # `F.triplet_margin_with_distance_loss`. Checks + # for parity against the respective non-distance-agnostic + # implementations of triplet margin loss (``nn.TripletMarginLoss` + # and `F.triplet_margin_loss`) under *default args*. 
+ + for extra_args in \ + itertools.product((0.5, 1, 1.5), (True, False), ('none', 'mean', 'sum')): + kwargs = {'margin': extra_args[0], 'swap': extra_args[1], 'reduction': extra_args[2]} + + anchor = torch.randn(5, 10, device=device, requires_grad=True) + positive = torch.randn(5, 10, device=device, requires_grad=True) + negative = torch.randn(5, 10, device=device, requires_grad=True) + + # Test forward, functional + expected = F.triplet_margin_loss(anchor, positive, negative, **kwargs) + actual = F.triplet_margin_with_distance_loss(anchor, positive, negative, **kwargs) + self.assertEqual(actual, expected, rtol=1e-6, atol=1e-6) + + # Test forward, module + loss_ref = nn.TripletMarginLoss(**kwargs) + loss_op = nn.TripletMarginWithDistanceLoss(**kwargs) + self.assertEqual(loss_op(anchor, positive, negative), + loss_ref(anchor, positive, negative), + rtol=1e-6, atol=1e-6) + + # Test backward + self.assertTrue(gradcheck(lambda a, p, n: F.triplet_margin_with_distance_loss( + a, p, n, **kwargs), (anchor, positive, negative))) + self.assertTrue(gradcheck(lambda a, p, n: loss_op(a, p, n), + (anchor, positive, negative))) + + @onlyOnCPUAndCUDA + def test_triplet_margin_with_distance_loss(self, device): + # Test for parity between `nn.TripletMarginWithDistanceLoss` and + # `F.triplet_margin_with_distance_loss`. + + pairwise_distance = nn.PairwiseDistance() + + def cosine_distance(x, y): + return 1.0 - F.cosine_similarity(x, y) + + distance_functions = (pairwise_distance, cosine_distance, + lambda x, y: 1.0 - F.cosine_similarity(x, y)) + + reductions = ('mean', 'none', 'sum') + margins = (1.0, 1.5, 0.5) + swaps = (True, False) + + for distance_fn, reduction, margin, swap \ + in itertools.product(distance_functions, reductions, margins, swaps): + anchor = torch.randn(5, 10, device=device, requires_grad=True) + positive = torch.randn(5, 10, device=device, requires_grad=True) + negative = torch.randn(5, 10, device=device, requires_grad=True) + + # Test backward + self.assertTrue(gradcheck(lambda a, p, n: F.triplet_margin_with_distance_loss( + a, p, n, distance_function=distance_fn, reduction=reduction, margin=margin, swap=swap), + (anchor, positive, negative))) + loss_op = nn.TripletMarginWithDistanceLoss(distance_function=distance_fn, + reduction=reduction, margin=margin, swap=swap) + self.assertTrue(gradcheck(lambda a, p, n: loss_op( + a, p, n), (anchor, positive, negative))) + traced_loss_op = torch.jit.trace(loss_op, (anchor, positive, negative)) + self.assertTrue(gradcheck(lambda a, p, n: traced_loss_op( + a, p, n), (anchor, positive, negative))) + + # Test forward parity + functional = F.triplet_margin_with_distance_loss(anchor, positive, negative, + distance_function=distance_fn, + reduction=reduction, margin=margin, swap=swap) + modular = loss_op(anchor, positive, negative) + traced = traced_loss_op(anchor, positive, negative) + self.assertEqual(functional, modular, atol=1e-6, rtol=1e-6) + self.assertEqual(traced, modular, atol=1e-6, rtol=1e-6) + + class TestModuleGlobalHooks(TestCase): def tearDown(self): diff --git a/torch/nn/functional.py b/torch/nn/functional.py index edde49a1d358..f4dbceeb88b1 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -3728,6 +3728,42 @@ def triplet_margin_loss(anchor, positive, negative, margin=1.0, p=2, eps=1e-6, s swap, reduction_enum) +def triplet_margin_with_distance_loss(anchor, positive, negative, *, distance_function=None, + margin=1.0, swap=False, reduction="mean"): + # type: (Tensor, Tensor, Tensor, Optional[Callable[[Tensor, Tensor], 
Tensor]], float, bool, str) -> Tensor + r""" + See :class:`~torch.nn.TripletMarginWithDistanceLoss` for details. + """ + if torch.jit.is_scripting(): + raise NotImplementedError("F.triplet_margin_with_distance_loss does not support JIT scripting: " + "functions requiring Callables cannot be scripted.") + + tens_ops = (anchor, positive, negative) + if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops): + return handle_torch_function( + triplet_margin_with_distance_loss, tens_ops, anchor, positive, negative, + distance_function=distance_function, margin=margin, swap=swap, reduction=reduction) + + distance_function = distance_function if distance_function is not None else pairwise_distance + + positive_dist = distance_function(anchor, positive) + negative_dist = distance_function(anchor, negative) + + if swap: + swap_dist = distance_function(positive, negative) + negative_dist = torch.min(negative_dist, swap_dist) + + output = torch.clamp(positive_dist - negative_dist + margin, min=0.0) + + reduction_enum = _Reduction.get_enum(reduction) + if reduction_enum == 1: + return output.mean() + elif reduction_enum == 2: + return output.sum() + else: + return output + + def normalize(input, p=2, dim=1, eps=1e-12, out=None): # type: (Tensor, float, int, float, Optional[Tensor]) -> Tensor r"""Performs :math:`L_p` normalization of inputs over specified dimension. diff --git a/torch/nn/functional.pyi.in b/torch/nn/functional.pyi.in index d7656b72425a..215fb0278dc6 100644 --- a/torch/nn/functional.pyi.in +++ b/torch/nn/functional.pyi.in @@ -22,9 +22,9 @@ GRID_SAMPLE_PADDING_MODES = Dict[str, int] # This was necessary since the JIT uses BroadcastingList* types but static checking with mypy etc requires a `Sequence` # type. There is no way to express the expected lengths of these lists in the current Python typing system. # -# Functions created via `_add_docstr` in `functional.py` where merely typed as `Any` by `stubgen`, so those were -# deleted from the stub and replaced by generated declarations. See `gen_pyi` for the implementation of the code -# generation logic for those functions. In the future, it might be worth looking into using the mypy plugin system +# Functions created via `_add_docstr` in `functional.py` where merely typed as `Any` by `stubgen`, so those were +# deleted from the stub and replaced by generated declarations. See `gen_pyi` for the implementation of the code +# generation logic for those functions. In the future, it might be worth looking into using the mypy plugin system # to encode the type semantics of `_add_docstr`, should that system ever become widespread. def fractional_max_pool2d_with_indices(input: Tensor, kernel_size: _size, output_size: Optional[_size] = ..., output_ratio: Optional[_ratio_any_t] = ..., return_indices: bool = ..., @@ -319,6 +319,11 @@ def triplet_margin_loss(anchor: Tensor, positive: Tensor, negative: Tensor, marg reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ... +def triplet_margin_with_distance_loss(anchor: Tensor, positive: Tensor, negative: Tensor, *, + distance_function: Optional[Callable[[Tensor, Tensor], Tensor]]=..., + margin: float=..., swap: bool=..., reduction: str=...) -> Tensor: ... + + def normalize(input: Tensor, p: float = ..., dim: int = ..., eps: float = ..., out: Optional[Tensor] = ...) -> Tensor: ... 
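A minimal usage sketch of the functional form introduced above, assuming only the keyword-only signature declared in the stub; the cosine-based distance below is an arbitrary illustration, not a required choice:

>>> import torch
>>> import torch.nn.functional as F
>>> anchor = torch.randn(4, 8, requires_grad=True)
>>> positive = torch.randn(4, 8, requires_grad=True)
>>> negative = torch.randn(4, 8, requires_grad=True)
>>> loss = F.triplet_margin_with_distance_loss(
...     anchor, positive, negative,
...     distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y),
...     margin=0.5, swap=True)
>>> loss.backward()

Because `distance_function` is an arbitrary Python callable, the functional form deliberately raises NotImplementedError under JIT scripting (see above); the new tests cover the scripted path by tracing the module form instead.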
diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index b5a03d4a049d..06a565700550 100644 --- a/torch/nn/modules/__init__.py +++ b/torch/nn/modules/__init__.py @@ -8,8 +8,8 @@ Hardsigmoid, Hardswish, SiLU from .loss import L1Loss, NLLLoss, KLDivLoss, MSELoss, BCELoss, BCEWithLogitsLoss, NLLLoss2d, \ CosineEmbeddingLoss, CTCLoss, HingeEmbeddingLoss, MarginRankingLoss, \ - MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, \ - SmoothL1Loss, SoftMarginLoss, CrossEntropyLoss, TripletMarginLoss, PoissonNLLLoss + MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, SmoothL1Loss, \ + SoftMarginLoss, CrossEntropyLoss, TripletMarginLoss, TripletMarginWithDistanceLoss, PoissonNLLLoss from .container import Container, Sequential, ModuleList, ModuleDict, ParameterList, ParameterDict from .pooling import AvgPool1d, AvgPool2d, AvgPool3d, MaxPool1d, MaxPool2d, MaxPool3d, \ MaxUnpool1d, MaxUnpool2d, MaxUnpool3d, FractionalMaxPool2d, FractionalMaxPool3d, LPPool1d, LPPool2d, \ @@ -54,5 +54,5 @@ 'ConstantPad3d', 'Bilinear', 'CosineSimilarity', 'Unfold', 'Fold', 'AdaptiveLogSoftmaxWithLoss', 'TransformerEncoder', 'TransformerDecoder', 'TransformerEncoderLayer', 'TransformerDecoderLayer', 'Transformer', - 'Flatten', 'Unflatten', 'Hardsigmoid', 'Hardswish', 'SiLU', + 'Flatten', 'Unflatten', 'Hardsigmoid', 'Hardswish', 'SiLU', 'TripletMarginWithDistanceLoss' ] diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index 62323fda40f4..91a62a85771e 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1,11 +1,12 @@ import warnings +from .distance import PairwiseDistance from .module import Module from .. import functional as F from .. import _reduction as _Reduction from torch import Tensor -from typing import Optional +from typing import Callable, Optional class _Loss(Module): @@ -1191,6 +1192,9 @@ class TripletMarginLoss(_Loss): .. math:: d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p + See also :class:`~torch.nn.TripletMarginWithDistanceLoss`, which computes the + triplet margin loss for input tensors using a custom distance function. + Args: margin (float, optional): Default: :math:`1`. p (int, optional): The norm degree for pairwise distance. Default: :math:`2`. @@ -1215,7 +1219,8 @@ class TripletMarginLoss(_Loss): Shape: - Input: :math:`(N, D)` where :math:`D` is the vector dimension. - - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`. + - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'``, or a scalar + otherwise. >>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2) >>> anchor = torch.randn(100, 128, requires_grad=True) @@ -1246,6 +1251,120 @@ def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor: eps=self.eps, swap=self.swap, reduction=self.reduction) +class TripletMarginWithDistanceLoss(_Loss): + r"""Creates a criterion that measures the triplet loss given input + tensors :math:`a`, :math:`p`, and :math:`n` (representing anchor, + positive, and negative examples, respectively), and a nonnegative, + real-valued function ("distance function") used to compute the relationship + between the anchor and positive example ("positive distance") and the + anchor and negative example ("negative distance"). + + The unreduced loss (i.e., with :attr:`reduction` set to ``'none'``) + can be described as: + + .. 
math:: + \ell(a, p, n) = L = \{l_1,\dots,l_N\}^\top, \quad + l_i = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\} + + where :math:`N` is the batch size; :math:`d` is a nonnegative, real-valued function + quantifying the closeness of two tensors, referred to as the :attr:`distance_function`; + and :math:`margin` is a non-negative margin representing the minimum difference + between the positive and negative distances that is required for the loss to + be 0. The input tensors have :math:`N` elements each and can be of any shape + that the distance function can handle. + + If :attr:`reduction` is not ``'none'`` + (default ``'mean'``), then: + + .. math:: + \ell(x, y) = + \begin{cases} + \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} + \end{cases} + + See also :class:`~torch.nn.TripletMarginLoss`, which computes the triplet + loss for input tensors using the :math:`l_p` distance as the distance function. + + Args: + distance_function (callable, optional): A nonnegative, real-valued function that + quantifies the closeness of two tensors. If not specified, + `nn.PairwiseDistance` will be used. Default: ``None`` + margin (float, optional): A non-negative margin representing the minimum difference + between the positive and negative distances required for the loss to be 0. Larger + margins penalize cases where the negative examples are not distant enough from the + anchors, relative to the positives. Default: :math:`1`. + swap (bool, optional): Whether to use the distance swap described in the paper + `Learning shallow convolutional feature descriptors with triplet losses` by + V. Balntas, E. Riba et al. If True, and if the positive example is closer to the + negative example than the anchor is, swaps the positive example and the anchor in + the loss computation. Default: ``False``. + reduction (string, optional): Specifies the (optional) reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'`` + + + Shape: + - Input: :math:`(N, *)` where :math:`*` represents any number of additional dimensions + as supported by the distance function. + - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'``, or a scalar + otherwise. 
+ + Examples:: + + >>> # Initialize embeddings + >>> embedding = nn.Embedding(1000, 128) + >>> anchor_ids = torch.randint(0, 1000, (1,), requires_grad=True) + >>> positive_ids = torch.randint(0, 1000, (1,), requires_grad=True) + >>> negative_ids = torch.randint(0, 1000, (1,), requires_grad=True) + >>> anchor = embedding(anchor_ids) + >>> positive = embedding(positive_ids) + >>> negative = embedding(negative_ids) + >>> + >>> # Built-in Distance Function + >>> triplet_loss = \ + >>> nn.TripletMarginWithDistanceLoss(distance_function=nn.PairwiseDistance()) + >>> output = triplet_loss(anchor, positive, negative) + >>> output.backward() + >>> + >>> # Custom Distance Function + >>> def l_infinity(x1, x2): + >>> return torch.max(torch.abs(x1 - x2), dim=1).values + >>> + >>> triplet_loss = \ + >>> nn.TripletMarginWithDistanceLoss(distance_function=l_infinity, margin=1.5) + >>> output = triplet_loss(anchor, positive, negative) + >>> output.backward() + >>> + >>> # Custom Distance Function (Lambda) + >>> triplet_loss = \ + >>> nn.TripletMarginWithDistanceLoss( + >>> distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y)) + >>> output = triplet_loss(anchor, positive, negative) + >>> output.backward() + + Reference: + V. Balntas, et al.: Learning shallow convolutional feature descriptors with triplet losses: + http://www.bmva.org/bmvc/2016/papers/paper119/index.html + """ + __constants__ = ['margin', 'swap', 'reduction'] + margin: float + swap: bool + + def __init__(self, *, distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = None, + margin: float = 1.0, swap: bool = False, reduction: str = 'mean'): + super(TripletMarginWithDistanceLoss, self).__init__(size_average=None, reduce=None, reduction=reduction) + self.distance_function = distance_function if distance_function is not None else PairwiseDistance() + self.margin = margin + self.swap = swap + + def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor: + return F.triplet_margin_with_distance_loss(anchor, positive, negative, + distance_function=self.distance_function, + margin=self.margin, swap=self.swap, reduction=self.reduction) + + class CTCLoss(_Loss): r"""The Connectionist Temporal Classification loss. 
diff --git a/torch/overrides.py b/torch/overrides.py index d17c6c4f7473..b287bf17958a 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -624,6 +624,9 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.nn.functional.threshold: lambda input, threshold, value, inplace=False: -1, torch.nn.functional.triplet_margin_loss: (lambda anchor, positive, negative, margin=1.0, p=2, eps=1e-06, swap=False, size_average=None, reduce=None, reduction='mean': -1), + torch.nn.functional.triplet_margin_with_distance_loss: (lambda anchor, positive, negative, *, + distance_function=None, margin=1.0, + swap=False, reduction='mean': -1), torch.nn.functional.unfold: lambda input, kernel_size, dilation=1, padding=0, stride=1: -1, torch.nonzero: lambda input, as_tuple=False: -1, torch.norm: lambda input, p='fro', dim=None, keepdim=False, out=None, dtype=None: -1, From e2b40ce793674531e0435510c6a1fe8f63e3958a Mon Sep 17 00:00:00 2001 From: Hong Xu Date: Tue, 22 Sep 2020 11:40:12 -0700 Subject: [PATCH 018/449] Support BFloat16 for binary logical operators on CUDA (#42485) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/42485 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D23684423 Pulled By: mruberry fbshipit-source-id: edc2b46b726361d4c8bf8a4bf4e4a09197b20428 --- .../ATen/native/cuda/BinaryLogicalOpsKernels.cu | 6 +++--- test/test_torch.py | 16 ---------------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu index 2a9b188520f5..20a851d1b2ce 100644 --- a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu @@ -10,7 +10,7 @@ namespace at { namespace native { void logical_and_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "logical_and_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, iter.common_dtype(), "logical_and_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a && b; }); @@ -18,7 +18,7 @@ void logical_and_kernel_cuda(TensorIterator& iter) { } void logical_or_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "logical_or_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, iter.common_dtype(), "logical_or_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a || b; }); @@ -26,7 +26,7 @@ void logical_or_kernel_cuda(TensorIterator& iter) { } void logical_xor_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "logical_xor_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, iter.common_dtype(), "logical_xor_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return bool(a) != bool(b); }); diff --git a/test/test_torch.py b/test/test_torch.py index c8dfd5115333..440bf30286bb 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6342,16 +6342,6 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): a = torch.tensor(a_, dtype=dtypes[0], device=device) b = torch.tensor(b_, dtype=dtypes[1], device=device) - # Skip bfloat16 on CUDA. Remove this after bfloat16 is supported on CUDA. 
- # After type promotion of bfloat16 is supported, some bfloat16 logical operation will go through on - # CUDA as long as the two tensors are promoted to a supported type. - # TODO: Remove this once logical operators are improved to take care of bfloat16. - if self.device_type == 'cuda' and torch.bfloat16 in dtypes: - if torch.promote_types(dtypes[0], dtypes[1]) == torch.bfloat16: - with self.assertRaises(RuntimeError): - getattr(a, op)(b) - return - if dtypes[0].is_complex or dtypes[1].is_complex: with self.assertRaises(RuntimeError): getattr(a, op)(b) @@ -6371,12 +6361,6 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): getattr(a, op + '_')(b) return - # TODO: remove when logical ops support bfloat16 on CUDA. - if self.device_type == 'cuda' and dtypes[0] == torch.bfloat16: - with self.assertRaises(RuntimeError): - getattr(a, op + '_')(b) - return - # TODO: remove when complex ops are supported if dtypes[0].is_complex: with self.assertRaises(RuntimeError): From 09aee06e821546d1aaee345143183e42261cb674 Mon Sep 17 00:00:00 2001 From: Daya Khudia Date: Tue, 22 Sep 2020 11:42:57 -0700 Subject: [PATCH 019/449] [caffe2] Replace embedding conversion ops with fbgemm functions (#44843) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44843 Replace perfkernels calls with fbgemm kernels to avoid code duplication ghstack-source-id: 112496292 Test Plan: CI Reviewed By: radkris-git Differential Revision: D23675519 fbshipit-source-id: 05c285a9eeb9ea109a04a78cb442a24ee40a4aec --- .../fused_nbit_rowwise_conversion.cc | 92 ++- .../fused_nbit_rowwise_conversion_avx2.cc | 534 ------------------ test/quantization/test_quantized_op.py | 4 +- 3 files changed, 34 insertions(+), 596 deletions(-) delete mode 100644 caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc diff --git a/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc b/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc index 528bbee3c2ca..35b9605021e6 100644 --- a/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc +++ b/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc @@ -6,6 +6,10 @@ #include "common.h" +#ifdef USE_FBGEMM +#include "fbgemm/QuantUtils.h" +#endif + namespace caffe2 { void FloatToFused8BitRowwiseQuantized__base( @@ -58,46 +62,32 @@ void Fused8BitRowwiseQuantizedToFloat__base( } } -decltype(FloatToFused8BitRowwiseQuantized__base) - FloatToFused8BitRowwiseQuantized__avx2_fma; void FloatToFused8BitRowwiseQuantized( const float* input, int input_rows, int input_columns, std::uint8_t* output) { - AVX2_FMA_DO( - FloatToFused8BitRowwiseQuantized, - input, - input_rows, - input_columns, - output); - BASE_DO( - FloatToFused8BitRowwiseQuantized, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::FloatToFused8BitRowwiseQuantizedSBFloat( + input, input_rows, input_columns, output); +#else + FloatToFused8BitRowwiseQuantized__base( + input, input_rows, input_columns, output); +#endif } -decltype(Fused8BitRowwiseQuantizedToFloat__base) - Fused8BitRowwiseQuantizedToFloat__avx2_fma; void Fused8BitRowwiseQuantizedToFloat( const std::uint8_t* input, int input_rows, int input_columns, float* output) { - AVX2_FMA_DO( - Fused8BitRowwiseQuantizedToFloat, - input, - input_rows, - input_columns, - output); - BASE_DO( - Fused8BitRowwiseQuantizedToFloat, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::Fused8BitRowwiseQuantizedSBFloatToFloat( + input, input_rows, input_columns, output); +#else + Fused8BitRowwiseQuantizedToFloat__base( + input, 
input_rows, input_columns, output); +#endif } void FloatToFusedNBitRowwiseQuantizedSBHalf__base( @@ -184,52 +174,34 @@ void FusedNBitRowwiseQuantizedSBHalfToFloat__base( } } -decltype(FloatToFusedNBitRowwiseQuantizedSBHalf__base) - FloatToFusedNBitRowwiseQuantizedSBHalf__avx2_fma; void FloatToFusedNBitRowwiseQuantizedSBHalf( int bit_rate, const float* input, int input_rows, int input_columns, std::uint8_t* output) { - AVX2_FMA_DO( - FloatToFusedNBitRowwiseQuantizedSBHalf, - bit_rate, - input, - input_rows, - input_columns, - output); - BASE_DO( - FloatToFusedNBitRowwiseQuantizedSBHalf, - bit_rate, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::FloatToFusedNBitRowwiseQuantizedSBHalf( + bit_rate, input, input_rows, input_columns, output); +#else + FloatToFusedNBitRowwiseQuantizedSBHalf__base( + bit_rate, input, input_rows, input_columns, output); +#endif } -decltype(FusedNBitRowwiseQuantizedSBHalfToFloat__base) - FusedNBitRowwiseQuantizedSBHalfToFloat__avx2_fma; void FusedNBitRowwiseQuantizedSBHalfToFloat( int bit_rate, const std::uint8_t* input, int input_rows, int input_columns, float* output) { - AVX2_FMA_DO( - FusedNBitRowwiseQuantizedSBHalfToFloat, - bit_rate, - input, - input_rows, - input_columns, - output); - BASE_DO( - FusedNBitRowwiseQuantizedSBHalfToFloat, - bit_rate, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::FusedNBitRowwiseQuantizedSBHalfToFloat( + bit_rate, input, input_rows, input_columns, output); +#else + FusedNBitRowwiseQuantizedSBHalfToFloat__base( + bit_rate, input, input_rows, input_columns, output); +#endif } } // namespace caffe2 diff --git a/caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc b/caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc deleted file mode 100644 index e7053b5136c0..000000000000 --- a/caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc +++ /dev/null @@ -1,534 +0,0 @@ -#include "./fused_nbit_rowwise_conversion.h" - -#include -#include -#include // for FLT_MAX -#include - -#include "./cvtsh_ss_bugfix.h" - -namespace caffe2 { - -constexpr int VLEN = 8; - -void FloatToFused8BitRowwiseQuantized__avx2_fma( - const float* input, - int input_rows, - int input_columns, - std::uint8_t* output) { - constexpr float kEpsilon = 1e-8f; - - __m256i permute_mask1_v = - _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); - __m256i shuffle_mask_v = _mm256_set_epi8( - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0c, - 0x08, - 0x04, - 0x00, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0c, - 0x08, - 0x04, - 0x00); - __m256i permute_mask2_v = - _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00); - - int output_columns = input_columns + 2 * sizeof(float); - for (std::size_t row = 0; row < input_rows; ++row) { - const float* input_row = input + row * input_columns; - std::uint8_t* output_row = output + row * output_columns; - float* output_row_scale_bias = - reinterpret_cast(output_row + input_columns); - - float minimum_element = FLT_MAX; - float maximum_element = -FLT_MAX; - __m256 min_v = _mm256_set1_ps(minimum_element); - __m256 max_v = _mm256_set1_ps(maximum_element); - std::size_t col; - for (col = 0; col < input_columns / VLEN * VLEN; col += VLEN) { - __m256 in_v = _mm256_loadu_ps(input_row + col); - min_v = _mm256_min_ps(min_v, in_v); - max_v = _mm256_max_ps(max_v, in_v); - } - alignas(64) float min_buf[VLEN], max_buf[VLEN]; - 
_mm256_store_ps(min_buf, min_v); - _mm256_store_ps(max_buf, max_v); - for (int i = 0; i < VLEN; ++i) { - minimum_element = std::min(minimum_element, min_buf[i]); - maximum_element = std::max(maximum_element, max_buf[i]); - } - for (; col < input_columns; ++col) { - minimum_element = std::min(minimum_element, input_row[col]); - maximum_element = std::max(maximum_element, input_row[col]); - } - - float range = maximum_element - minimum_element; - - output_row_scale_bias[0] = range / 255.0f; - output_row_scale_bias[1] = minimum_element; - const auto inverse_scale = 255.0f / (range + kEpsilon); - min_v = _mm256_set1_ps(minimum_element); - __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale); - - for (col = 0; col < input_columns / (4 * VLEN) * (4 * VLEN); - col += 4 * VLEN) { - __m256i x_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col), min_v), - inverse_scale_v)); - __m256i y_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + VLEN), min_v), - inverse_scale_v)); - __m256i z_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 2 * VLEN), min_v), - inverse_scale_v)); - __m256i w_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 3 * VLEN), min_v), - inverse_scale_v)); - - // An instruction sequence to save 32 32-bit integers as 8-bit integers - __m256i xy_packed_v = _mm256_packs_epi32(x_rounded_v, y_rounded_v); - __m256i zw_packed_v = _mm256_packs_epi32(z_rounded_v, w_rounded_v); - __m256i xyzw_packed_v = _mm256_packus_epi16(xy_packed_v, zw_packed_v); - xyzw_packed_v = - _mm256_permutevar8x32_epi32(xyzw_packed_v, permute_mask1_v); - _mm256_storeu_si256( - reinterpret_cast<__m256i*>(output_row + col), xyzw_packed_v); - } - for (; col < input_columns / VLEN * VLEN; col += VLEN) { - __m256i rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col), min_v), - inverse_scale_v)); - - // An instruction sequence to save 8 32-bit integers as 8-bit integers - rounded_v = _mm256_shuffle_epi8(rounded_v, shuffle_mask_v); - rounded_v = _mm256_permutevar8x32_epi32(rounded_v, permute_mask2_v); - _mm_storel_epi64( - reinterpret_cast<__m128i*>(output_row + col), - _mm256_castsi256_si128(rounded_v)); - } - for (; col < input_columns; ++col) { - output_row[col] = - std::lrintf((input_row[col] - minimum_element) * inverse_scale); - } - } -} - -void Fused8BitRowwiseQuantizedToFloat__avx2_fma( - const std::uint8_t* input, - int input_rows, - int input_columns, - float* output) { - int output_columns = input_columns - 2 * sizeof(float); - - for (std::size_t row = 0; row < input_rows; ++row) { - const std::uint8_t* input_row = input + row * input_columns; - const float* input_row_scale_bias = - reinterpret_cast(input_row + output_columns); - float* output_row = output + row * output_columns; - - __m256 scale_v = _mm256_set1_ps(input_row_scale_bias[0]); - __m256 bias_v = _mm256_set1_ps(input_row_scale_bias[1]); - - std::size_t col; - for (col = 0; col < output_columns / VLEN * VLEN; col += VLEN) { - __m256 in_v = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( - _mm_loadl_epi64(reinterpret_cast(input_row + col)))); - _mm256_storeu_ps( - output_row + col, - _mm256_add_ps(_mm256_mul_ps(in_v, scale_v), bias_v)); - } - - for (; col < output_columns; ++col) { - output_row[col] = - input_row[col] * input_row_scale_bias[0] + input_row_scale_bias[1]; - } - } -} - -namespace { - -template -void 
FloatToFusedNBitRowwiseQuantizedSBHalf_( - const float* input, - int input_rows, - int input_columns, - std::uint8_t* output) { - __m256i permute_mask1_v = - _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); - - int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; - int output_columns = - (input_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE + - 2 * sizeof(std::uint16_t); - for (std::size_t row = 0; row < input_rows; ++row) { - const float* input_row = input + row * input_columns; - std::uint8_t* output_row = output + row * output_columns; - std::uint16_t* output_row_scale_bias = reinterpret_cast( - output_row + - (input_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); - - float minimum_element = FLT_MAX; - float maximum_element = -FLT_MAX; - __m256 min_v = _mm256_set1_ps(minimum_element); - __m256 max_v = _mm256_set1_ps(maximum_element); - std::size_t col; - for (col = 0; col < input_columns / VLEN * VLEN; col += VLEN) { - __m256 in_v = _mm256_loadu_ps(input_row + col); - min_v = _mm256_min_ps(min_v, in_v); - max_v = _mm256_max_ps(max_v, in_v); - } - alignas(64) float min_buf[VLEN], max_buf[VLEN]; - _mm256_store_ps(min_buf, min_v); - _mm256_store_ps(max_buf, max_v); - for (int i = 0; i < VLEN; ++i) { - minimum_element = std::min(minimum_element, min_buf[i]); - maximum_element = std::max(maximum_element, max_buf[i]); - } - for (; col < input_columns; ++col) { - minimum_element = std::min(minimum_element, input_row[col]); - maximum_element = std::max(maximum_element, input_row[col]); - } - - output_row_scale_bias[1] = _cvtss_sh( - minimum_element, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - minimum_element = _cvtsh_ss(output_row_scale_bias[1]); - const float range = maximum_element - minimum_element; - - float scale = range == 0 ? 
1.0f : range / ((1 << BIT_RATE) - 1); - std::uint16_t scale_fp16 = - _cvtss_sh(scale, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - scale = _cvtsh_ss(scale_fp16); - if (scale == 0) { - // Corner case handling when maximum_element == minimum_element - // Any scale would work because maximum_element - minimum_element will be - // 0 for all X - scale = 1.0f; - } - float inverse_scale = 1.0f / scale; - if (std::isinf(inverse_scale)) { - scale = 1.0f; - inverse_scale = 1.0f; - } - - output_row_scale_bias[0] = - _cvtss_sh(scale, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - - __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale); - min_v = _mm256_set1_ps(minimum_element); - - col = 0; - - if (BIT_RATE == 2 || BIT_RATE == 4) { - for (; col + 4 * VLEN <= input_columns; col += 4 * VLEN) { - __m256i x_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col), min_v), - inverse_scale_v)); - __m256i y_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + VLEN), min_v), - inverse_scale_v)); - __m256i z_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 2 * VLEN), min_v), - inverse_scale_v)); - __m256i w_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 3 * VLEN), min_v), - inverse_scale_v)); - - // An instruction sequence to save 32 32-bit integers as 8-bit integers - __m256i xy_packed_v = _mm256_packs_epi32(x_rounded_v, y_rounded_v); - __m256i zw_packed_v = _mm256_packs_epi32(z_rounded_v, w_rounded_v); - __m256i xyzw_packed_v = _mm256_packus_epi16(xy_packed_v, zw_packed_v); - xyzw_packed_v = - _mm256_permutevar8x32_epi32(xyzw_packed_v, permute_mask1_v); - - // saturate to BIT_RATE - xyzw_packed_v = _mm256_min_epu8( - xyzw_packed_v, - _mm256_set1_epi8(static_cast((1 << BIT_RATE) - 1))); - - if (BIT_RATE == 4) { - // pack into lower 8-bit of each 16-bit - xyzw_packed_v = _mm256_and_si256( - _mm256_or_si256( - xyzw_packed_v, _mm256_srli_epi16(xyzw_packed_v, 4)), - _mm256_set1_epi16(0x00ff)); - } else { - // pack into lower 8-bit of each 32-bit - xyzw_packed_v = _mm256_and_si256( - _mm256_or_si256( - _mm256_or_si256( - xyzw_packed_v, _mm256_srli_epi32(xyzw_packed_v, 6)), - _mm256_or_si256( - _mm256_srli_epi32(xyzw_packed_v, 8 + 4), - _mm256_srli_epi32(xyzw_packed_v, 2 * 8 + 2))), - _mm256_set1_epi32(0x00ff)); - } - - __m128i out_v; - if (BIT_RATE == 4) { - // avx2 doesn't have _mm256_cvtepi16_epi8 - out_v = _mm_packus_epi16( - _mm256_castsi256_si128(xyzw_packed_v), - _mm256_extractf128_si256(xyzw_packed_v, 1)); - _mm_storeu_si128( - reinterpret_cast<__m128i*>(output_row + col / NUM_ELEM_PER_BYTE), - out_v); - } else { - // avx2 doesn't have _mm256_cvtepi32_epi8 - out_v = _mm_packus_epi32( - _mm256_castsi256_si128(xyzw_packed_v), - _mm256_extractf128_si256(xyzw_packed_v, 1)); - out_v = _mm_packus_epi16(out_v, out_v); - _mm_storel_epi64( - reinterpret_cast<__m128i*>(output_row + col / NUM_ELEM_PER_BYTE), - out_v); - } - } - } - - for (; col < input_columns; ++col) { - float X = input_row[col]; - std::uint8_t quantized = std::max( - 0, - std::min( - std::lrintf((X - minimum_element) * inverse_scale), - (1 << BIT_RATE) - 1)); - if (col % NUM_ELEM_PER_BYTE == 0) { - output_row[col / NUM_ELEM_PER_BYTE] = quantized; - } else { - output_row[col / NUM_ELEM_PER_BYTE] |= - (quantized << ((col % NUM_ELEM_PER_BYTE) * BIT_RATE)); - } - } - } -} - -template -void FusedNBitRowwiseQuantizedSBHalfToFloat_( - const std::uint8_t* input, - int 
input_rows, - int input_columns, - float* output) { - constexpr int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; - int output_columns = - (input_columns - 2 * sizeof(std::uint16_t)) * NUM_ELEM_PER_BYTE; - - // mask can be accessed by avx2_ps_or_epi32_combined_mask[(8 - remainder) % 8] - static const int avx2_ps_or_epi32_combined_mask[16] = { - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - }; - - // Compute a remainder for vector load - // Since every row is followed by 2 fp16 (scale and bias), luckily - // we don't need mask at bit-rate granularity but just at 32-bit - // granularity. - constexpr int NUM_ELEM_PER_32BIT = 32 / BIT_RATE; - // multiply by 4 because we're handling 4 vlen per iteration - constexpr int NUM_OF_32BIT_PER_VLOAD = VLEN * 4 / NUM_ELEM_PER_32BIT; - int remainder_32bit_granularity = (output_columns + NUM_ELEM_PER_32BIT - 1) / - NUM_ELEM_PER_32BIT % NUM_OF_32BIT_PER_VLOAD; - __m128i vmask_load = _mm_lddqu_si128(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + NUM_OF_32BIT_PER_VLOAD + - (NUM_OF_32BIT_PER_VLOAD - remainder_32bit_granularity) % - NUM_OF_32BIT_PER_VLOAD)); - int remainder = output_columns % (4 * VLEN); - __m256i vmask_store0 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - std::min(output_columns % (4 * VLEN), VLEN) % (VLEN + 1)))); - __m256i vmask_store1 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - - std::max(0, std::min(output_columns % (4 * VLEN) - VLEN, VLEN)) % - (VLEN + 1)))); - __m256i vmask_store2 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - - std::max(0, std::min(output_columns % (4 * VLEN) - 2 * VLEN, VLEN)) % - (VLEN + 1)))); - __m256i vmask_store3 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - - std::max(0, std::min(output_columns % (4 * VLEN) - 3 * VLEN, VLEN)) % - (VLEN + 1)))); - - for (std::size_t row = 0; row < input_rows; ++row) { - const std::uint8_t* input_row = input + row * input_columns; - const std::uint16_t* input_row_scale_bias = - reinterpret_cast( - input_row + - (output_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); - float scale = _cvtsh_ss(input_row_scale_bias[0]); - float bias = _cvtsh_ss(input_row_scale_bias[1]); - float* output_row = output + row * output_columns; - - std::size_t col = 0; - if (BIT_RATE == 4 || BIT_RATE == 2) { - __m256 vscale = _mm256_set1_ps(scale); - __m256 vbias = _mm256_set1_ps(bias); - for (; col + 4 * VLEN <= output_columns; col += 4 * VLEN) { - __m256i vinq; - // unpack to 8-bit integers - if (BIT_RATE == 4) { - vinq = _mm256_cvtepu8_epi16( - _mm_loadu_si128(reinterpret_cast( - input_row + col / NUM_ELEM_PER_BYTE))); - vinq = _mm256_and_si256( - _mm256_or_si256(vinq, _mm256_slli_epi32(vinq, 4)), - _mm256_set1_epi16(0x0f0f)); - } else { - vinq = _mm256_cvtepu8_epi32( - _mm_loadl_epi64(reinterpret_cast( - input_row + col / NUM_ELEM_PER_BYTE))); - vinq = _mm256_and_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_slli_epi32(vinq, 2 * 8 + 2), - _mm256_slli_epi32(vinq, 8 + 4)), - _mm256_or_si256(_mm256_slli_epi32(vinq, 6), vinq)), - _mm256_set1_epi32(0x03030303)); - } - __m256 vinq0 = _mm256_cvtepi32_ps( - _mm256_cvtepi8_epi32(_mm256_castsi256_si128(vinq))); - __m256 vinq1 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 1)))); - __m256 vinq2 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 2)))); - __m256 vinq3 = 
_mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 3)))); - vinq0 = _mm256_fmadd_ps(vscale, vinq0, vbias); - vinq1 = _mm256_fmadd_ps(vscale, vinq1, vbias); - vinq2 = _mm256_fmadd_ps(vscale, vinq2, vbias); - vinq3 = _mm256_fmadd_ps(vscale, vinq3, vbias); - _mm256_storeu_ps(output_row + col, vinq0); - _mm256_storeu_ps(output_row + col + VLEN, vinq1); - _mm256_storeu_ps(output_row + col + 2 * VLEN, vinq2); - _mm256_storeu_ps(output_row + col + 3 * VLEN, vinq3); - } - - if (remainder) { - __m256i vinq; - if (BIT_RATE == 4) { - vinq = _mm256_cvtepu8_epi16(_mm_maskload_epi32( - reinterpret_cast(input_row + col / NUM_ELEM_PER_BYTE), - vmask_load)); - vinq = _mm256_and_si256( - _mm256_or_si256(vinq, _mm256_slli_epi32(vinq, 4)), - _mm256_set1_epi16(0x0f0f)); - } else { - vinq = _mm256_cvtepu8_epi32(_mm_maskload_epi32( - reinterpret_cast(input_row + col / NUM_ELEM_PER_BYTE), - vmask_load)); - vinq = _mm256_and_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_slli_epi32(vinq, 2 * 8 + 2), - _mm256_slli_epi32(vinq, 8 + 4)), - _mm256_or_si256(_mm256_slli_epi32(vinq, 6), vinq)), - _mm256_set1_epi32(0x03030303)); - } - - __m256 vinq0 = _mm256_cvtepi32_ps( - _mm256_cvtepi8_epi32(_mm256_castsi256_si128(vinq))); - __m256 vinq1 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 1)))); - __m256 vinq2 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 2)))); - __m256 vinq3 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 3)))); - - vinq0 = _mm256_fmadd_ps(vscale, vinq0, vbias); - vinq1 = _mm256_fmadd_ps(vscale, vinq1, vbias); - vinq2 = _mm256_fmadd_ps(vscale, vinq2, vbias); - vinq3 = _mm256_fmadd_ps(vscale, vinq3, vbias); - - _mm256_maskstore_ps(output_row + col, vmask_store0, vinq0); - _mm256_maskstore_ps(output_row + col + VLEN, vmask_store1, vinq1); - _mm256_maskstore_ps(output_row + col + 2 * VLEN, vmask_store2, vinq2); - _mm256_maskstore_ps(output_row + col + 3 * VLEN, vmask_store3, vinq3); - } - } else { - for (; col < output_columns; ++col) { - std::uint8_t quantized = input_row[col / NUM_ELEM_PER_BYTE]; - quantized >>= (col % NUM_ELEM_PER_BYTE) * BIT_RATE; - quantized &= (1 << BIT_RATE) - 1; - output_row[col] = scale * quantized + bias; - } - } - } -} -} // namespace - -void FloatToFusedNBitRowwiseQuantizedSBHalf__avx2_fma( - int bit_rate, - const float* input, - int input_rows, - int input_columns, - std::uint8_t* output) { - if (bit_rate == 2) { - FloatToFusedNBitRowwiseQuantizedSBHalf_<2>( - input, input_rows, input_columns, output); - } else if (bit_rate == 4) { - FloatToFusedNBitRowwiseQuantizedSBHalf_<4>( - input, input_rows, input_columns, output); - } else if (bit_rate == 8) { - FloatToFusedNBitRowwiseQuantizedSBHalf_<8>( - input, input_rows, input_columns, output); - } -} - -void FusedNBitRowwiseQuantizedSBHalfToFloat__avx2_fma( - int bit_rate, - const std::uint8_t* input, - int input_rows, - int input_columns, - float* output) { - if (bit_rate == 2) { - FusedNBitRowwiseQuantizedSBHalfToFloat_<2>( - input, input_rows, input_columns, output); - } else if (bit_rate == 4) { - FusedNBitRowwiseQuantizedSBHalfToFloat_<4>( - input, input_rows, input_columns, output); - } else { - FusedNBitRowwiseQuantizedSBHalfToFloat_<8>( - input, input_rows, input_columns, output); - } -} - -} // namespace caffe2 diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index 3a0e6f10bf33..674ace864343 100644 --- 
a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -2779,9 +2779,9 @@ def get_c2_weights(weights): w_packed_c2, w_unpacked_c2 = get_c2_weights(weights) # Compare packed weights against C2. - np.testing.assert_equal(w_packed.numpy(), w_packed_c2.numpy()) + np.testing.assert_allclose(w_packed.numpy(), w_packed_c2.numpy(), atol=1e-6, rtol=1e-6) # Compare unpacked weights against C2 - np.testing.assert_equal(w_unpacked.numpy(), w_unpacked_c2.numpy()) + np.testing.assert_allclose(w_unpacked.numpy(), w_unpacked_c2.numpy(), atol=1e-6, rtol=1e-6) """ Tests the correctness of the embedding_bag_8bit pack/unpack op against C2 """ @given(num_embeddings=st.integers(10, 100), From 2b1f25885e667b29f806afd01f68652393fbd07c Mon Sep 17 00:00:00 2001 From: Zafar Date: Tue, 22 Sep 2020 11:44:14 -0700 Subject: [PATCH 020/449] [quant] Fix ConvTranspose mapping (#44844) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44844 Test Plan: Imported from OSS Reviewed By: jerryzh168 Differential Revision: D23746466 Pulled By: z-a-f fbshipit-source-id: cb84e0fef5ab82e8ed8dd118d9fb21ee7b480ef7 --- torch/quantization/quantization_mappings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py index 585b018a5b01..60d166ae4896 100644 --- a/torch/quantization/quantization_mappings.py +++ b/torch/quantization/quantization_mappings.py @@ -21,6 +21,8 @@ nn.Conv1d: nnq.Conv1d, nn.Conv2d: nnq.Conv2d, nn.Conv3d: nnq.Conv3d, + nn.ConvTranspose1d: nnq.ConvTranspose1d, + nn.ConvTranspose2d: nnq.ConvTranspose2d, nn.BatchNorm2d: nnq.BatchNorm2d, nn.BatchNorm3d: nnq.BatchNorm3d, nn.LayerNorm: nnq.LayerNorm, From c253b101545cc1a2ba5a4ab467cd972a63ac072d Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Tue, 22 Sep 2020 11:51:58 -0700 Subject: [PATCH 021/449] Fix incorrect EnumValue serialization issue (#44891) Summary: Previously, `prim::EnumValue` is serialized to `ops.prim.EnumValue`, which doesn't have the right implementation to refine return type. This diff correctly serializes it to enum.value, thus fixing the issue. 
Fixes https://github.com/pytorch/pytorch/issues/44892 Pull Request resolved: https://github.com/pytorch/pytorch/pull/44891 Reviewed By: malfet Differential Revision: D23818962 Pulled By: gmagogsfm fbshipit-source-id: 6edfdf9c4b932176b08abc69284a916cab10081b --- test/jit/test_enum.py | 20 +++++++++++++++++++ torch/csrc/jit/serialization/python_print.cpp | 10 ++++++++++ 2 files changed, 30 insertions(+) diff --git a/test/jit/test_enum.py b/test/jit/test_enum.py index a242217a94c1..aa34c22413ad 100644 --- a/test/jit/test_enum.py +++ b/test/jit/test_enum.py @@ -267,6 +267,26 @@ def forward(self): self.assertEqual(scripted(), Color.RED.value) + def test_string_enum_as_module_attribute(self): + global Color + + class Color(Enum): + RED = "red" + GREEN = "green" + + class TestModule(torch.nn.Module): + def __init__(self, e: Color): + super(TestModule, self).__init__() + self.e = e + + def forward(self): + return (self.e.name, self.e.value) + + m = TestModule(Color.RED) + scripted = torch.jit.script(m) + + self.assertEqual(scripted(), (Color.RED.name, Color.RED.value)) + def test_enum_return(self): global Color diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index b8339b5c86a7..e04339dacc22 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -1126,6 +1126,16 @@ struct PythonPrintImpl { stmt << useOf(node->input(0)) << ".tolist()" << ")"; } break; + case prim::EnumValue: + // Note: This CAN NOT be printed as raw operator ops.prim.EnumValue + // because its return type depends on type of enum and must be further + // resolved, but ops.prim.EnumValue construction does not provide such + // functionality. + stmt << "(" << useOf(node->input()) << ").value"; + break; + case prim::EnumName: + stmt << "(" << useOf(node->input()) << ").name"; + break; default: { printOpName(stmt, node->kind()); const FunctionSchema& schema = node->schema(); From a4ce3f4194d2a5764d228f623b3daaf72480ed51 Mon Sep 17 00:00:00 2001 From: Viswesh Sankaran Date: Tue, 22 Sep 2020 13:37:40 -0700 Subject: [PATCH 022/449] Fix type hint warnings for common_methods_invocations.py (#44971) Summary: Fixes a subtask of https://github.com/pytorch/pytorch/issues/42969 Tested the following and no warnings were seen. python test/test_type_hints.py .... 
---------------------------------------------------------------------- Ran 4 tests in 180.759s OK Pull Request resolved: https://github.com/pytorch/pytorch/pull/44971 Reviewed By: walterddr Differential Revision: D23822274 Pulled By: visweshfb fbshipit-source-id: e3485021e348ee0a8508a9d128f04bad721795ef --- mypy.ini | 3 --- torch/testing/_internal/common_methods_invocations.py | 8 +++++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mypy.ini b/mypy.ini index a7d4acea9571..1891f2790d1e 100644 --- a/mypy.ini +++ b/mypy.ini @@ -62,9 +62,6 @@ ignore_errors = True [mypy-torch.testing._internal.hypothesis_utils.*] ignore_errors = True -[mypy-torch.testing._internal.common_methods_invocations.*] -ignore_errors = True - [mypy-torch.testing._internal.common_nn.*] ignore_errors = True diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index dd429deacbf0..b208f220e30a 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -7,6 +7,8 @@ from torch._six import inf, istuple from torch.autograd import Variable +from typing import List, Tuple, Dict, Any + from torch.testing import \ (make_non_contiguous, _dispatch_dtypes, floating_types, floating_types_and, floating_and_complex_types, @@ -1540,7 +1542,7 @@ def _compare_large_trilu_indices( (2028, 1, -1) ] -tri_large_tests_args = [ +tri_large_tests_args: List[Tuple[int, ...]] = [ # Large test cases below are deliberately commented out to speed up CI # tests and to avoid OOM error. When modifying implementations of # tril_indices and triu_indices, please enable these tests and make sure @@ -1602,9 +1604,9 @@ def unpack_variables(args): 'reshape', 'where' # argument order } -EXCLUDE_GRADCHECK = { +EXCLUDE_GRADCHECK: Dict[str, Any] = { } -EXCLUDE_GRADGRADCHECK = { +EXCLUDE_GRADGRADCHECK: Dict[str, Any] = { } EXCLUDE_GRADGRADCHECK_BY_TEST_NAME = { # *det methods uses svd in backward when matrix is not invertible. 
However, From def433bbb6a914532bf3eb0687751a56e8dae685 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Tue, 22 Sep 2020 14:55:49 -0700 Subject: [PATCH 023/449] .circleci: Upgrade all xcode 9 workers to xcode 11 (#45153) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45153 xcode 9 is being deprectated within circleci infra so we should get everything else on a more recent version of xcode Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D23852774 Pulled By: seemethere fbshipit-source-id: c02e162f1993d408de439fee21b340e9640e5a24 --- .circleci/config.yml | 8 ++++---- .circleci/verbatim-sources/job-specs/binary-job-specs.yml | 4 ++-- .circleci/verbatim-sources/job-specs/job-specs-custom.yml | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5ca2d725b9e9..c952ee716b3d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -924,7 +924,7 @@ jobs: smoke_mac_test: <<: *binary_linux_test_upload_params macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: - checkout - run: @@ -949,7 +949,7 @@ jobs: binary_mac_build: <<: *binary_mac_params macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout @@ -1253,7 +1253,7 @@ jobs: environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-build macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: - checkout - run_brew_for_macos_build @@ -1287,7 +1287,7 @@ jobs: environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: - checkout - attach_workspace: diff --git a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml index bd26e8b2b373..7e635f42bce4 100644 --- a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml @@ -135,7 +135,7 @@ smoke_mac_test: <<: *binary_linux_test_upload_params macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: - checkout - run: @@ -160,7 +160,7 @@ binary_mac_build: <<: *binary_mac_params macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout diff --git a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml index 810f16922d5c..9cc75136cfdd 100644 --- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml +++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml @@ -109,7 +109,7 @@ environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-build macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: - checkout - run_brew_for_macos_build @@ -143,7 +143,7 @@ environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: - checkout - attach_workspace: From 79fe794f871691f7c4f3727694a3b9a9339b32f3 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 22 Sep 2020 14:56:15 -0700 Subject: [PATCH 024/449] [FX] Make Graphs immutable and make GraphModule recompile after assigning graph (#44830) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44830 Test Plan: Imported from OSS Reviewed By: zdevito Differential Revision: D23743850 Pulled By: jamesr66a fbshipit-source-id: 501b92a89ff636c26abeff13105a75462384554c --- test/test_fx.py | 13 ++++++++----- torch/fx/graph.py | 20 ++++++++++++++++---- torch/fx/graph_module.py | 30 ++++++++++++++++++++++++++---- 3 
files changed, 50 insertions(+), 13 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index 89311e2a2873..f191a73c40c4 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -174,10 +174,12 @@ def forward(self, a, b): return a + b m = M() g = symbolic_trace(m).graph - t = Proxy(g.result) + new_g = torch.fx.Graph() + new_g.graph_copy(g) + t = Proxy(new_g.nodes[-1]) # test that we can use proxy objects to generate more graph code later for things that do not need to work with modules. - g.output((t + t).node) - gm = GraphModule(m, g) + new_g.output((t + t).node) + gm = GraphModule(m, new_g) self.assertEqual(gm(3, 4), 14) @skipIfNoTorchVision @@ -466,9 +468,10 @@ def test_deepcopy_graphmodule_with_transform(self): traced = symbolic_trace(st) def transform(traced): - new_graph = copy.deepcopy(traced.graph) + new_graph = torch.fx.Graph() + new_graph.graph_copy(traced.graph) relu_out = new_graph.create_node( - op='call_method', target='neg', args=(new_graph.result,), kwargs={}) + op='call_method', target='neg', args=(new_graph.nodes[-1],), kwargs={}) new_graph.output(relu_out) return GraphModule(traced, new_graph) transformed = transform(traced) diff --git a/torch/fx/graph.py b/torch/fx/graph.py index a63b7c8b35dc..6214f60c61e6 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -67,9 +67,21 @@ def map_arg(a: Argument, fn: Callable[[Node], Argument]) -> Argument: class Graph: def __init__(self): - self.nodes : List[Node] = [] + self._nodes : List[Node] = [] self._used_names : Dict[str, int] = {} # base name -> number + @property + def nodes(self): + return tuple(self._nodes) + + def graph_copy(self, g : 'Graph'): + """ + Append all nodes from graph `g` to this graph + """ + val_map : Dict[Node, Node] = {} + for node in g._nodes: + val_map[node] = self.node_copy(node, lambda n : val_map[n]) + def _mark_uses(self, a: Argument): def add_use(n: Node): n.uses += 1 @@ -86,7 +98,7 @@ def create_node(self, op: str, target: Target, self._mark_uses(args) self._mark_uses(kwargs) n = Node(self, name if name is not None else self._name(target), op, target, args, kwargs) - self.nodes.append(n) + self._nodes.append(n) return n # sugar for above when you know the op @@ -161,7 +173,7 @@ def _name(self, target: Target) -> str: def python_code(self, root_module: str) -> Tuple[str, str, List[str]]: free_vars: List[str] = [] body: List[str] = [] - for node in self.nodes: + for node in self._nodes: if node.op == 'placeholder': assert isinstance(node.target, str) free_vars.append(node.target) @@ -237,7 +249,7 @@ def format_node(n : Node) -> Optional[str]: f'args = {format_arg(n.args)}, kwargs = {format_arg(n.kwargs)})' - node_strs = [format_node(node) for node in self.nodes] + node_strs = [format_node(node) for node in self._nodes] param_str = ', '.join(placeholder_names) s = f'graph({param_str}):' for node_str in node_strs: diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index df40cbd84fe1..83feed72b752 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -97,6 +97,17 @@ def _assign_attr(from_obj: Any, to_module: torch.nn.Module, target: str): setattr(to_module, field, from_obj) class GraphModule(torch.nn.Module): + """ + GraphModule is an nn.Module generated from an fx.Graph. 
GraphModule has + important attributes: + + graph : The graph from which this GraphModule was generated + code : The Python source code for the function generated from `graph` + forward : The Python method generated from `graph` + + Note that when `graph` is reassigned, `code` and `forward` will be automatically + regenerated. + """ def __new__(cls: 'Type[GraphModule]', *args, **kwargs): # each instance of a graph module needs its own forward method # so create a new singleton class for each instance. @@ -148,10 +159,21 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph): else: raise RuntimeError('Unsupported type ' + str(root) + ' passed for root!') self.graph = graph - self._generate_forward() - def _generate_forward(self) -> None: - body, result, free_variables = self.graph.python_code(root_module='self') + # TorchScript breaks trying to compile the graph setter because of the + # continued string literal. Issue here: https://github.com/pytorch/pytorch/issues/44842 + # + # Shouldn't be an issue since these methods shouldn't be used in TorchScript anyway + __ignored_properties__ = ['graph'] + + @property + def graph(self): + return self._graph + + @graph.setter + def graph(self, val) -> None: + self._graph = val + body, result, free_variables = self._graph.python_code(root_module='self') body = '\n'.join(' ' + line for line in body.split('\n')) + '\n' self.code = f"""\ def forward(self, {', '.join(free_variables)}): @@ -163,7 +185,7 @@ def forward(self, {', '.join(free_variables)}): def __reduce__(self): dict_without_graph = self.__dict__.copy() - del dict_without_graph['graph'] + del dict_without_graph['_graph'] return (deserialize_graphmodule, (dict_without_graph,)) # because __reduce__ is defined for serialization, From d1c68a706985d7115c42bfb007b4cf643d172050 Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Tue, 22 Sep 2020 15:06:14 -0700 Subject: [PATCH 025/449] Clarify that 5-D 'bilinear' grid_sample is actually trilinear (#45090) Summary: Fixes https://github.com/pytorch/pytorch/issues/41528 Pull Request resolved: https://github.com/pytorch/pytorch/pull/45090 Reviewed By: ailzhang Differential Revision: D23841046 Pulled By: zou3519 fbshipit-source-id: 941770cd5b3e705608957739026e9113e5f0c616 --- torch/nn/functional.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index f4dbceeb88b1..2fdb40b2d93f 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -3331,6 +3331,9 @@ def grid_sample(input, grid, mode='bilinear', padding_mode='zeros', align_corner or :math:`(N, D_\text{out}, H_\text{out}, W_\text{out}, 3)` (5-D case) mode (str): interpolation mode to calculate output values ``'bilinear'`` | ``'nearest'``. Default: ``'bilinear'`` + Note: When ``mode='bilinear'`` and the input is 5-D, the interpolation mode + used internally will actually be trilinear. However, when the input is 4-D, + the interpolation mode will legitimately be bilinear. padding_mode (str): padding mode for outside grid values ``'zeros'`` | ``'border'`` | ``'reflection'``. Default: ``'zeros'`` align_corners (bool, optional): Geometrically, we consider the pixels of the From cddcfde81d6482baa2e84fd1400e9ee60c4f9a3e Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Tue, 22 Sep 2020 16:28:39 -0700 Subject: [PATCH 026/449] [JIT] Fix WithTest.test_with_exceptions (#45106) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45106 **Summary** This commit fixes `WithTest.test_with_exceptions`. 
It's been running in regular Python this whole time; none of the functions created and invoked for the test were scripted. Fortunately, the tests still pass after being fixed. **Test Plan** Ran unit tests + continuous integration. Test Plan: Imported from OSS Reviewed By: gmagogsfm Differential Revision: D23848206 Pulled By: SplitInfinity fbshipit-source-id: fd975ee34db9441ef4e4a4abf2fb21298166bbaa --- test/jit/test_with.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/test/jit/test_with.py b/test/jit/test_with.py index 15e1362ea722..ffd0631639f6 100644 --- a/test/jit/test_with.py +++ b/test/jit/test_with.py @@ -359,6 +359,7 @@ def test_with_exceptions(self): Check that exceptions thrown in the bodies of with-statements are handled correctly. """ + global Context @torch.jit.script class Context(object): @@ -379,10 +380,12 @@ def __enter__(self): def __exit__(self, type: Any, value: Any, tb: Any): self.count.sub_(0.3) + @torch.jit.script def method_that_raises(): # type: () -> Tensor - raise Exception() + raise Exception("raised exception") + @torch.jit.script def test_exception(x, c): # type: (Tensor, Context) -> Tensor """ @@ -393,6 +396,7 @@ def test_exception(x, c): return x + @torch.jit.script def test_exception_nested(x, c): # type: (Tensor, Context) -> Tensor """ @@ -404,6 +408,7 @@ def test_exception_nested(x, c): return x + @torch.jit.script def with_that_raises(c): # type: (Context) -> Tensor a = torch.tensor([1]) @@ -413,6 +418,7 @@ def with_that_raises(c): return a + @torch.jit.script def test_exception_fn_call(x, c): # type: (Tensor, Context) -> Tensor """ @@ -426,15 +432,18 @@ def test_exception_fn_call(x, c): c = Context(1) - with self.assertRaises(Exception): + # checkScript and checkScriptRaisesRegex cannot be used because the string frontend will + # not compile class types (of which Context, the context manager being used for this test + # is one). + with self.assertRaisesRegex(Exception, r"raised exception"): test_exception(torch.randn(2), c) self.assertEqual(c.count, 1) - with self.assertRaises(Exception): + with self.assertRaisesRegex(Exception, r"raised exception"): test_exception_nested(torch.randn(2), c) self.assertEqual(c.count, 1) - with self.assertRaises(Exception): + with self.assertRaisesRegex(Exception, r"raised exception"): test_exception_fn_call(torch.randn(2), c) self.assertEqual(c.count, 1) From 35cdb01327ddbfc886ca08a60064009fe362fdad Mon Sep 17 00:00:00 2001 From: hangjunxu Date: Tue, 22 Sep 2020 16:52:34 -0700 Subject: [PATCH 027/449] [PyTorch] Enable type check for autocast_test_lists (#45107) Summary: This is a sub-task for addressing: https://github.com/pytorch/pytorch/issues/42969. We re-enable type check for `autocast_test_lists `. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45107 Test Plan: `python test/test_type_hints.py` passed: ``` (pytorch) bash-5.0$ with-proxy python test/test_type_hints.py .... 
---------------------------------------------------------------------- Ran 4 tests in 103.871s OK ``` Reviewed By: walterddr Differential Revision: D23842884 Pulled By: Hangjun fbshipit-source-id: a39f3810e3abebc6b4c1cb996b06312f6d42ffd6 --- mypy.ini | 3 --- torch/testing/_internal/autocast_test_lists.py | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/mypy.ini b/mypy.ini index 1891f2790d1e..07cdbc4dd6fa 100644 --- a/mypy.ini +++ b/mypy.ini @@ -56,9 +56,6 @@ ignore_errors = True [mypy-torch.testing._internal.codegen.*] ignore_errors = True -[mypy-torch.testing._internal.autocast_test_lists.*] -ignore_errors = True - [mypy-torch.testing._internal.hypothesis_utils.*] ignore_errors = True diff --git a/torch/testing/_internal/autocast_test_lists.py b/torch/testing/_internal/autocast_test_lists.py index 015cbd658816..13f65952af24 100644 --- a/torch/testing/_internal/autocast_test_lists.py +++ b/torch/testing/_internal/autocast_test_lists.py @@ -155,8 +155,7 @@ def __init__(self, dev): ("norm", pointwise0_fp16, {"p": 1}), ("norm", pointwise0_fp16, {"p": 1, "dim": 0}), ("cosine_similarity", mat0_fp16 + mat1_fp16), - ("poisson_nll_loss", mat0_fp16 + mat1_fp16 + (True, False, 1.e-8, - torch.nn.functional._Reduction.get_enum('mean'))), + ("poisson_nll_loss", mat0_fp16 + mat1_fp16 + (True, False, 1.e-8, torch.nn._reduction.get_enum('mean'))), ("cosine_embedding_loss", (torch.tensor([[1, 2, 3]], device=dev, dtype=torch.float16), torch.tensor([[1, 3, 4]], device=dev, dtype=torch.float16), torch.tensor([1], device=dev, dtype=torch.int))), From 7f4a27be3a23487cb74c578d31535c7e4c8aa6c4 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 22 Sep 2020 17:04:35 -0700 Subject: [PATCH 028/449] [resubmit][FX] s/get_param/get_attr/ (#45147) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45147 ghstack-source-id: 112605923 Test Plan: Imported from OSS Reviewed By: eellison Differential Revision: D23845096 fbshipit-source-id: 9ca209aa84cbaddd6e89c52b541e43b11197e2d5 --- test/fx/quantization.py | 2 +- test/test_fx.py | 6 +++--- torch/fx/__init__.py | 4 ++-- torch/fx/graph.py | 10 +++++----- torch/fx/graph_module.py | 4 ++-- torch/fx/symbolic_trace.py | 8 ++++---- torch/quantization/fx/quantize.py | 4 ++-- torch/quantization/fx/utils.py | 4 ++-- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/test/fx/quantization.py b/test/fx/quantization.py index 8116ed5ce89a..968c797c9163 100644 --- a/test/fx/quantization.py +++ b/test/fx/quantization.py @@ -222,7 +222,7 @@ def load_arg(a): for node in self.graph.nodes: if node.op == 'placeholder': result = next(args_iter) - elif node.op == 'get_param': + elif node.op == 'get_attr': result = self.state_dict[node.target] elif node.op == 'call_function': result = node.target(*load_arg(node.args), **load_arg(node.kwargs)) diff --git a/test/test_fx.py b/test/test_fx.py index f191a73c40c4..a48274e16809 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -338,7 +338,7 @@ def __init__(self, interpreter): placeholder_nodes.append(graph.create_node('placeholder', name)) # Get the interpreter object - interpreter_node = graph.create_node('get_param', 'interpreter') + interpreter_node = graph.create_node('get_attr', 'interpreter') # Add a node to call the interpreter instance output_node = graph.create_node( @@ -570,7 +570,7 @@ def test_graph_fns(self): g = Graph() a = g.placeholder('a') b = g.call_module('linear', (a,)) - c = g.get_param('bias') + c = g.get_attr('bias') d = g.call_method('add', (b, c)) e = 
g.call_function(torch.sin, (d,)) g.output(e) @@ -587,7 +587,7 @@ def test_construct_root_dict(self): graph : torch.fx.Graph = torch.fx.Graph() a : torch.fx.Node = graph.create_node('placeholder', 'x') b : torch.fx.Node = graph.create_node('call_module', 'foo.bar.baz', args=(a,)) - c : torch.fx.Node = graph.create_node('get_param', 'zip.zap.zam') + c : torch.fx.Node = graph.create_node('get_attr', 'zip.zap.zam') d : torch.fx.Node = graph.create_node('call_function', operator.add, args=(b, c)) graph.output(d) diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py index 5b90c434340c..185511460740 100644 --- a/torch/fx/__init__.py +++ b/torch/fx/__init__.py @@ -36,7 +36,7 @@ def forward(self, x): opcode name target args kwargs ------------- ------------- ------------------------------------------------------- ------------------ ----------- placeholder x x () {} -get_param linear_weight linear.weight () {} +get_attr linear_weight linear.weight () {} call_function add_1 (x, linear_weight) {} call_module linear_1 linear (add_1,) {} call_method relu_2 relu [linear_1] {} @@ -48,7 +48,7 @@ def forward(self, x): - `placeholder` represents a function input. The `name` attribute specifies the name this value will take on. `target` is similarly the name of the argument. `args` and `kwargs` are don't-care -- `get_param` retrieves a parameter from the module hierarchy. `name` is similarly the name the result of the +- `get_attr` retrieves a parameter from the module hierarchy. `name` is similarly the name the result of the fetch is assigned to. `target` is the fully-qualified name of the parameter's position in the module hierarchy. `args` and `kwargs` are don't-care - `call_function` applies a free function to some values. `name` is similarly the name of the value to assign diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 6214f60c61e6..6ca60f6211aa 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -92,7 +92,7 @@ def create_node(self, op: str, target: Target, args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, name: Optional[str] = None) -> Node: - assert op in ('call_function', 'call_method', 'get_param', 'call_module', 'placeholder') + assert op in ('call_function', 'call_method', 'get_attr', 'call_module', 'placeholder') args = () if args is None else args kwargs = {} if kwargs is None else kwargs self._mark_uses(args) @@ -105,8 +105,8 @@ def create_node(self, op: str, target: Target, def placeholder(self, name: str) -> Node: return self.create_node('placeholder', name) - def get_param(self, name: str) -> Node: - return self.create_node('get_param', name) + def get_attr(self, name: str) -> Node: + return self.create_node('get_attr', name) def call_module(self, module_name: str, @@ -208,7 +208,7 @@ def python_code(self, root_module: str) -> Tuple[str, str, List[str]]: assert isinstance(node.target, str) body.append(f'{node.name} = {_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})\n') continue - elif node.op == 'get_param': + elif node.op == 'get_attr': assert isinstance(node.target, str) body.append(f'{node.name} = {_format_target(root_module, node.target)}\n') continue @@ -242,7 +242,7 @@ def format_node(n : Node) -> Optional[str]: assert isinstance(n.target, str) placeholder_names.append(n.target) return None - elif n.op == 'get_param': + elif n.op == 'get_attr': return f'%{n.name} : [uses={n.uses}] = self.{n.target}' else: return f'%{n.name} : [uses={n.uses}] = {n.op}[target={n.target}](' \ diff --git 
a/torch/fx/graph_module.py b/torch/fx/graph_module.py index 83feed72b752..e635819550ad 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -135,13 +135,13 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph): if hasattr(root, 'training'): self.training = root.training for node in graph.nodes: - if node.op in ['get_param', 'call_module']: + if node.op in ['get_attr', 'call_module']: assert isinstance(node.target, str) _copy_attr(root, self, node.target) elif isinstance(root, dict): targets_to_copy = [] for node in graph.nodes: - if node.op in ['get_param', 'call_module']: + if node.op in ['get_attr', 'call_module']: assert isinstance(node.target, str) if node.target not in root: raise RuntimeError('Node ' + str(node) + ' referenced target ' + node.target + diff --git a/torch/fx/symbolic_trace.py b/torch/fx/symbolic_trace.py index 442fa28c36d9..9b192dd5501f 100644 --- a/torch/fx/symbolic_trace.py +++ b/torch/fx/symbolic_trace.py @@ -55,15 +55,15 @@ def create_arg(self, a: Any) -> Argument: if isinstance(a, torch.nn.Parameter): for n, p in self.root.named_parameters(): if a is p: - return self.create_node('get_param', n, (), {}) + return self.create_node('get_attr', n, (), {}) raise NameError('parameter is not a member of this module') # Tensors do not have a reliable string repr() from which they can be # constructed (and we probably don't want to rely on that, either), so # for any constant Tensor values we encounter, first search for if they # are an attribute of some module in the module hierarchy. If so, emit - # a get_param to retrieve that tensor. Otherwise, we'll store away the + # a get_attr to retrieve that tensor. Otherwise, we'll store away the # tensor value into a special attribute on the Module s.t. we can - # retrieve it with a get_param. + # retrieve it with a get_attr. 
if isinstance(a, torch.Tensor): # TODO: slow def search_for_tensor(m : torch.nn.Module) -> Optional[List[str]]: @@ -96,7 +96,7 @@ def search_for_tensor(m : torch.nn.Module) -> Optional[List[str]]: i += 1 setattr(self.root, qualname, a) - return self.create_node('get_param', qualname, (), {}) + return self.create_node('get_attr', qualname, (), {}) return super().create_arg(a) def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool: diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 8d8ef0f328c3..7967b4ec2dcb 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -177,7 +177,7 @@ def get_qconfig(module): self.qconfig_map = dict() for node in input_graph.nodes: - if node.op == 'get_param': + if node.op == 'get_attr': parent, _ = _parent_name(node.target) self.qconfig_map[node.name] = get_qconfig(self.modules[parent]) elif node.op == 'call_function': @@ -557,7 +557,7 @@ def load_arg(a): setattr(quantized_root, packed_weight_name, packed_weight) # replace prepack node with a getattr node env[node.name] = folded_graph.create_node( - 'get_param', packed_weight_name, (), {}) + 'get_attr', packed_weight_name, (), {}) elif prepack_node is not None: # remove the foled node continue diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 95d19df1e1b4..5d5532dc48fc 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -17,7 +17,7 @@ def graph_pretty_str(g, shorten=True) -> str: built_in_meth_re = re.compile('') op_dict = { 'placeholder': 'plchdr', - 'get_param': 'gt_prm', + 'get_attr': 'gt_prm', 'call_function': 'cl_fun', 'call_module': 'cl_mod', 'call_method': 'cl_meth', @@ -136,5 +136,5 @@ def get_next_qparams_idx(module, qparams): for key, value in qparams.items(): setattr(root_module, key + str(idx), value) qparam_full_path = key + str(idx) - inputs.append(graph.create_node('get_param', qparam_full_path)) + inputs.append(graph.create_node('get_attr', qparam_full_path)) return graph.create_node('call_function', quantize_op, tuple(inputs), {}) From ccfbfe5eb5c318d17f6994be31fe3f38261addff Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 22 Sep 2020 17:09:47 -0700 Subject: [PATCH 029/449] [quant][graphmode][fx] Custom module support (#44766) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44766 There might be modules that are not symbolically traceable, e.g. LSTM (since it has input dependent control flows), to support quantization in these cases, user will provide the corresponding observed and quantized version of the custom module, the observed custom module with observers already inserted in the module and the quantized version will have the corresponding ops quantized. 
And use ``` from torch.quantization import register_observed_custom_module_mapping from torch.quantization import register_quantized_custom_module_mapping register_observed_custom_module_mapping(CustomModule, ObservedCustomModule) register_quantized_custom_module_mapping(CustomModule, QuantizedCustomModule) ``` to register the custom module mappings, we'll also need to define a custom delegate class for symbolic trace in order to prevent the custom module from being traced: ```python class CustomDelegate(DefaultDelegate): def is_leaf_module(self, m): return (m.__module__.startswith('torch.nn') and not isinstance(m, torch.nn.Sequential)) or \ isinstance(m, CustomModule) m = symbolic_trace(original_m, delegate_class=CustomDelegate) ``` Test Plan: Imported from OSS Reviewed By: z-a-f Differential Revision: D23723455 fbshipit-source-id: 50d666e29b94cbcbea5fb6bcc73b00cff87eb77a --- test/quantization/test_quantize_fx.py | 136 ++++++++++++++++++ torch/nn/quantized/modules/conv.py | 2 +- torch/quantization/__init__.py | 6 + .../custom_module_class_mappings.py | 75 ++++++++++ .../quantization/fx/quantization_patterns.py | 25 ++++ torch/quantization/fx/quantize.py | 32 ++++- 6 files changed, 274 insertions(+), 2 deletions(-) create mode 100644 torch/quantization/custom_module_class_mappings.py diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 3170bfbfe8b4..fc4a735854ef 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -20,6 +20,8 @@ quantize_static_fx, quantize_dynamic_fx, prepare_qat_fx, + register_observed_custom_module_mapping, + register_quantized_custom_module_mapping, ) from torch.quantization import ( @@ -482,6 +484,140 @@ def forward(self, x): # Verify that loaded state dict produces same results. 
self.assertEqual(quant(x), quant_2(x)) + @skipIfNoFBGEMM + def test_custom_module_class(self): + class CustomModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + return self.conv(x) + + class ObservedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_float(cls, float_module): + assert hasattr(float_module, 'qconfig') + observed = cls(float_module.conv) + observed.qconfig = float_module.qconfig + return observed + + class QuantizedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 'activation_post_process') + observed_module.conv.activation_post_process = \ + observed_module.activation_post_process + quantized = cls(nnq.Conv2d.from_float(observed_module.conv)) + return quantized + + class DynamicallyQuantizedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 'activation_post_process') + quantized = cls(nnqd.Conv2d.from_float(observed_module.conv)) + return quantized + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.custom = CustomModule() + + def forward(self, x): + x = self.conv(x) + x = self.custom(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + data = torch.randn(1, 1, 1, 1) + # instantiate M and RefM and align the parameters + original_m = M() + original_ref_m = RefM() + original_ref_m.conv1.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv1.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + original_ref_m.conv2.weight = torch.nn.Parameter(original_m.custom.conv.weight.detach()) + original_ref_m.conv2.bias = torch.nn.Parameter(original_m.custom.conv.bias.detach()) + + from torch.fx.symbolic_trace import Tracer + + # define a custom tracer to not trace through the custom module + + class CustomTracer(Tracer): + def is_leaf_module(self, m, module_qualified_name): + return (m.__module__.startswith('torch.nn') and + not isinstance(m, torch.nn.Sequential)) or \ + isinstance(m, CustomModule) + + # TODO: add other quant types after mixed mode support + for quant_type in [QuantType.STATIC]: + # register observed and quantized custom module classes + register_observed_custom_module_mapping(CustomModule, ObservedCustomModule) + register_quantized_custom_module_mapping(CustomModule, QuantizedCustomModule) + + m = CustomTracer().trace(original_m).eval() + qconfig_dict = {'': default_qconfig} + # check prepared model + m = prepare_static_fx(m, qconfig_dict) + # calibration + m(data) + # all activation observers are inserted in the top level module + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 3 + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + + # check converted/quantized model 
+ m = convert_static_fx(m) + count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method('dequantize') : 1, + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + res = m(data) + + # quantize the reference model + ref_m = symbolic_trace(original_ref_m).eval() + ref_m = prepare_fx(ref_m, qconfig_dict) + ref_m(data) + ref_m = convert_fx(ref_m) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops """ diff --git a/torch/nn/quantized/modules/conv.py b/torch/nn/quantized/modules/conv.py index fe1ced91624f..773a9a37fbb3 100644 --- a/torch/nn/quantized/modules/conv.py +++ b/torch/nn/quantized/modules/conv.py @@ -146,7 +146,7 @@ def __setstate__(self, state): @classmethod def get_qconv(cls, mod, activation_post_process, weight_post_process=None): - r"""Creates a qconv object and returns it. + r"""Creates a qconv object and returns it. """ if weight_post_process is None: weight_post_process = mod.qconfig.weight() diff --git a/torch/quantization/__init__.py b/torch/quantization/__init__.py index ed908ddf85c3..3193c332469f 100644 --- a/torch/quantization/__init__.py +++ b/torch/quantization/__init__.py @@ -9,6 +9,7 @@ from .quantize_fx import * from .quantization_mappings import * from .fuser_method_mappings import * +from .custom_module_class_mappings import * def default_eval_fn(model, calib_data): r""" @@ -40,6 +41,11 @@ def default_eval_fn(model, calib_data): 'get_compare_output_module_list', 'register_quantized_operator_mapping', 'get_quantized_operator', 'register_fuser_method', 'get_fuser_method', + 'register_observed_custom_module_mapping', + 'get_observed_custom_module_class', + 'register_quantized_custom_mdoule_mapping', + 'get_quantized_custom_module_class', + 'is_custom_module_class', # Sub functions for `prepare` and `swap_module` 'propagate_qconfig_', 'add_quant_dequant', 'add_observer_', 'swap_module', 'default_eval_fn', 'get_observer_dict', diff --git a/torch/quantization/custom_module_class_mappings.py b/torch/quantization/custom_module_class_mappings.py new file mode 100644 index 000000000000..c62290228c5b --- /dev/null +++ b/torch/quantization/custom_module_class_mappings.py @@ -0,0 +1,75 @@ +OBSERVED_CUSTOM_MODULE_CLASS_MAPPINGS = dict() + +def register_observed_custom_module_mapping(float_custom_module_class, observed_custom_module_class): + """ Register a mapping from `float_custom_module_class` to + `observed_custom_module_class` + `observed_custom_module_class` will have a `from_float` classmethod, + which will return an observed custom module instance given + a float custom module instance. + This will be used in prepare step of post training static quantization or + quantization aware training + """ + assert hasattr(observed_custom_module_class, 'from_float'), 'from_float must be' + \ + ' defined in observed custom module class' + OBSERVED_CUSTOM_MODULE_CLASS_MAPPINGS[float_custom_module_class] = \ + observed_custom_module_class + +def get_observed_custom_module_class(float_custom_module_class): + """ Get the corresponding observed module class for a given + float custom module. 
+ """ + observed_custom_module_class = \ + OBSERVED_CUSTOM_MODULE_CLASS_MAPPINGS.get(float_custom_module_class, None) + assert observed_custom_module_class is not None, \ + 'Float Custom module class {}'.format(float_custom_module_class) + \ + ' does not have a corresponding observed module class' + return observed_custom_module_class + +QUANTIZED_CUSTOM_MODULE_CLASS_MAPPINGS = dict() + +def register_quantized_custom_module_mapping(float_custom_module_class, quantized_custom_module_class): + """ Register a mapping from `float_custom_module_class` to `quantized_custom_module_class` + A quantized custom module class should accept quantized input and + return quantized output. (we can relax this condition in the + future if there is a need) + `quantized_custom_module_class` will have a `from_observed` classmethod, + which will return an quantized custom module instance given + a observed custom module instance. + This will be used in prepare step of post training static quantization or + quantization aware training + """ + assert hasattr(quantized_custom_module_class, 'from_observed'), 'from_observed' + \ + ' must be defined in quantized custom module class' + QUANTIZED_CUSTOM_MODULE_CLASS_MAPPINGS[float_custom_module_class] = \ + quantized_custom_module_class + +def get_quantized_custom_module_class(float_custom_module_class): + """ Get the corresponding quantized module class for a given + float custom module. + """ + quantized_custom_module_class = \ + QUANTIZED_CUSTOM_MODULE_CLASS_MAPPINGS.get(float_custom_module_class, None) + assert quantized_custom_module_class is not None, \ + 'Float Custom module class {}'.format(float_custom_module_class) + \ + ' does not have a corresponding quantized module class' + return quantized_custom_module_class + +def is_custom_module_class(module_class): + """ Check if a given module class is a custom module class + """ + return module_class in OBSERVED_CUSTOM_MODULE_CLASS_MAPPINGS and \ + module_class in QUANTIZED_CUSTOM_MODULE_CLASS_MAPPINGS + +def mark_observed_custom_module(module, custom_module_class): + """ Mark a module as observed custom module, so that + it can be identified during convert step + """ + module._is_observed_custom_module = True + module._FLOAT_MODULE = custom_module_class + +def is_observed_custom_module(module): + """ Check if a module is marked as observed custom module + or not + """ + return hasattr(module, '_is_observed_custom_module') and \ + module._is_observed_custom_module diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index fa5a8733bbf7..ab85c9a9daff 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -6,6 +6,9 @@ get_static_quant_module_class, get_quantized_operator, ) +from ..custom_module_class_mappings import ( + get_quantized_custom_module_class, +) from .pattern_utils import ( register_quant_pattern, register_dynamic_quant_pattern, @@ -507,6 +510,28 @@ def convert(self, quantizer, node): quantizer.quantized_graph, node, quantizer.activation_post_process_map[node.name]) +class CustomModuleQuantizeHandler(QuantizeHandler): + def convert(self, quantizer, node, load_arg, debug=False): + """ Convert a float custom module to quantized custom module + """ + assert node.op == 'call_module' + observed_custom_module = quantizer.modules[node.target] + if node.name in quantizer.activation_post_process_map: + observed_custom_module.activation_post_process = \ + 
quantizer.activation_post_process_map[node.name] + quantized_custom_module_class = \ + get_quantized_custom_module_class(observed_custom_module._FLOAT_MODULE) + quantized_custom_module = \ + quantized_custom_module_class.from_observed(observed_custom_module) + parent_name, name = _parent_name(node.target) + setattr(quantizer.modules[parent_name], name, quantized_custom_module) + # hardcoded the qunatized input to be None (take whatever is in the environemnt), + # we can extend this + # if there is a need, e.g. get the indexes of quantized inputs from some + # module attribute like module._QUANTIZED_INPUT_INDEXES + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) + + # 2. Post Training Dynamic Quantizatoin Patterns @register_dynamic_quant_pattern(torch.nn.Linear) @register_dynamic_quant_pattern(torch.nn.functional.linear) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 7967b4ec2dcb..8d742255838a 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -18,6 +18,12 @@ from ..quantization_mappings import ( get_qat_module_mappings, ) +from ..custom_module_class_mappings import ( + is_custom_module_class, + get_observed_custom_module_class, + mark_observed_custom_module, + is_observed_custom_module, +) from ..quantize import _remove_qconfig @@ -193,7 +199,6 @@ def _prepare(self, model, qconfig_dict, inplace, is_dynamic_quant): if not inplace: model = copy.deepcopy(model) self.is_dynamic_quant = is_dynamic_quant - # TODO: allow user specified patterns if self.is_dynamic_quant: self.patterns = get_dynamic_quant_patterns() else: @@ -235,6 +240,8 @@ def load_arg(a): env[node.name] = observed_graph.node_copy(node, load_arg) elif root_node is node: env[node.name] = observed_graph.node_copy(node, load_arg) + if qconfig is None: + continue def insert_observer(node, observer, device): get_new_observer_name = get_new_attr_name_with_prefix(prefix) @@ -246,10 +253,22 @@ def insert_observer(node, observer, device): if device: getattr(model, observer_name).to(device) + if isinstance(obj, CustomModuleQuantizeHandler): + custom_module = self.modules[node.target] + observed_custom_module_class = \ + get_observed_custom_module_class(type(custom_module)) + observed_custom_module = \ + observed_custom_module_class.from_float(custom_module) + mark_observed_custom_module(observed_custom_module, type(custom_module)) + parent_name, name = _parent_name(node.target) + setattr(self.modules[parent_name], name, observed_custom_module) + # don't need to insert observer for output in dynamic quantization if self.is_dynamic_quant: continue + # inserting observers for output of observed module, or mark the output + # as observed if isinstance(obj, CopyNode): assert node.op in [ 'call_module', @@ -355,6 +374,7 @@ def _convert(self, model, inplace=False, debug=False, is_dynamic_quant=False): self.modules = dict(model.named_modules()) matches = self._find_matches(model.graph, self.modules, self.patterns) + quants = self._find_quants(model.graph, matches) self.quantized_graph = Graph() env = {} @@ -619,6 +639,16 @@ def record_match(pattern, node, matched): all_matched.add(n.name) # break after finding the first match break + + # add custom module instances to the match result + for node in graph.nodes: + if node.op == 'call_module' and \ + (is_custom_module_class(type(self.modules[node.target])) or + is_observed_custom_module(self.modules[node.target])): + custom_module_qconfig = self.qconfig_map[node.name] + match_map[node.name] = 
( + node, [node], CustomModuleQuantizeHandler(self, node), custom_module_qconfig) + return match_map def _find_quants(self, graph, matches): From 2a37f3fd2f74e2d10f3440e6dfef2d5389caab62 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 22 Sep 2020 17:24:54 -0700 Subject: [PATCH 030/449] Relax CUDA architecture check (#45130) Summary: NVIDIA GPUs are binary compatible within major compute capability revision This would prevent: "GeForce RTX 3080 with CUDA capability sm_86 is not compatible with the current PyTorch installation." messages from appearing, since CUDA-11 do not support code generation for sm_85. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45130 Reviewed By: ngimel Differential Revision: D23841556 Pulled By: malfet fbshipit-source-id: bcfc9e8da63dfe62cdec06909b6c049aaed6a18a --- torch/cuda/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 53aea1141d47..e8687cad17e8 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -96,9 +96,8 @@ def _check_cubins(): supported_sm = [int(arch.split('_')[1]) for arch in arch_list if 'sm_' in arch] for idx in range(device_count()): cap_major, cap_minor = get_device_capability(idx) - capability = cap_major * 10 + cap_minor - # NVIDIA GPU compute architectures are backward compatible within 5 minor revisions versions - supported = any([capability >= sm and capability - (sm // 5) * 5 < 5 for sm in supported_sm]) + # NVIDIA GPU compute architectures are backward compatible within major version + supported = any([sm // 10 == cap_major for sm in supported_sm]) if not supported: device_name = get_device_name(idx) warnings.warn(incompatible_device_warn.format(device_name, capability, " ".join(arch_list), device_name)) From b98ac208492a421944a1ae19ef7883ab1a97bb73 Mon Sep 17 00:00:00 2001 From: "Daily, Jeff" Date: Tue, 22 Sep 2020 17:41:41 -0700 Subject: [PATCH 031/449] install ATen/native/cuda and hip headers (#45097) Summary: The ATen/native/cuda headers were copied to torch/include, but then not included in the final package. Further, add ATen/native/hip headers to the installation, as well. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45097 Reviewed By: mruberry Differential Revision: D23831006 Pulled By: malfet fbshipit-source-id: ab527928185faaa912fd8cab208733a9b11a097b --- aten/src/ATen/CMakeLists.txt | 3 ++- setup.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 1bcbae8abeff..5ec9d24eea39 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -78,6 +78,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu") exclude(native_cuda_cu "${native_cuda_cu}" ${native_cuda_cu_sp}) file(GLOB native_cuda_cpp "native/cuda/*.cpp") file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") +file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh") file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") @@ -372,7 +373,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS}) if(NOT INTERN_BUILD_MOBILE) - list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${cudnn_h} ${hip_h} ${miopen_h}) + list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${miopen_h}) endif() # https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake diff --git a/setup.py b/setup.py index 2a2f911e0d3d..753e2b0f14a1 100644 --- a/setup.py +++ b/setup.py @@ -776,6 +776,10 @@ def print_box(msg): 'include/ATen/detail/*.h', 'include/ATen/native/*.h', 'include/ATen/native/cpu/*.h', + 'include/ATen/native/cuda/*.h', + 'include/ATen/native/cuda/*.cuh', + 'include/ATen/native/hip/*.h', + 'include/ATen/native/hip/*.cuh', 'include/ATen/native/quantized/*.h', 'include/ATen/native/quantized/cpu/*.h', 'include/ATen/quantized/*.h', From c0267c68454cf469d760c0eb3e952c1cb5f63af5 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Tue, 22 Sep 2020 17:47:28 -0700 Subject: [PATCH 032/449] [caffe2] Support data types in shape hints (#45110) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45110 A recent change in DSNN quantizes the ad embedding to 8 bits. Ad embeddings are part of the inputs to the DSNN merge net. To correctly pass shape hints of input tensors including quantized ad embeddings, we need to be able to annotate the data types in shape hints. A bit on the corner cases, if type is omitted or not a valid type, e.g., white spaces, instead of throwing an exception, I decided to return the default type, float. 
Test Plan: ``` buck test caffe2/caffe2/fb/opt:shape_info_utils_test ``` Reviewed By: yinghai Differential Revision: D23834091 fbshipit-source-id: 5e072144a7a7ff4b5126b618062dfc4041851dd3 --- caffe2/opt/shape_info.cc | 75 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 4 deletions(-) diff --git a/caffe2/opt/shape_info.cc b/caffe2/opt/shape_info.cc index 0ff55693395f..dfcdeb0356bd 100644 --- a/caffe2/opt/shape_info.cc +++ b/caffe2/opt/shape_info.cc @@ -5,6 +5,63 @@ namespace caffe2 { +namespace { +bool isNumber(const std::string& s) { + bool empty = true; + for (const char c : s) { + if (std::isalpha(c)) { + return false; + } + if (!std::isspace(c)) { + empty = false; + } + } + return !empty; +} + +std::string toLower(const std::string& s) { + std::string t; + t.resize(s.size()); + for (size_t i = 0; i < t.size(); i++) { + t[i] = std::tolower(s[i]); + } + return t; +} + +TensorProto_DataType toTensorProtoDataType(const std::string& in) { + std::string s = toLower(in); + if (s == "uint8") { + return TensorProto_DataType_UINT8; + } else if (s == "int8") { + return TensorProto_DataType_INT8; + } else if (s == "uint16") { + return TensorProto_DataType_UINT16; + } else if (s == "int16") { + return TensorProto_DataType_INT16; + } else if (s == "int32") { + return TensorProto_DataType_INT32; + } else if (s == "int64") { + return TensorProto_DataType_INT64; + } else if (s == "float16" || s == "half") { + return TensorProto_DataType_FLOAT16; + } else if (s == "float") { + return TensorProto_DataType_FLOAT; + } else if (s == "double") { + return TensorProto_DataType_DOUBLE; + } else if (s == "byte") { + return TensorProto_DataType_BYTE; + } else if (s == "string") { + return TensorProto_DataType_STRING; + } else if (s == "bool") { + return TensorProto_DataType_BOOL; + } else if (s == "hash") { + return TensorProto_DataType_ZERO_COLLISION_HASH; + } + // return default data type, float + return TensorProto_DataType_FLOAT; +} +} // namespace + ShapeInfo getShapeInfoFromBlob(const Blob* blob) { ShapeInfo shape_info; shape_info.shape = GetTensorShapeOfBlob(blob); @@ -138,14 +195,24 @@ void parseShapeInfoMapFromString( const auto& name = kv[0]; TensorShape shape; - if (name.find("int8") != std::string::npos) { - shape.set_data_type(TensorProto_DataType_UINT8); + size_t size = kv.size(); + CAFFE_ENFORCE_GT(size, 1); + if (!isNumber(kv[size - 1])) { + // last value is the type + shape.set_data_type(toTensorProtoDataType(kv[size - 1])); + size--; } else { - shape.set_data_type(TensorProto_DataType_FLOAT); + if (name.find("int8") != std::string::npos) { + // Kept for backwards compatibility. + // Set type explicitly to overwrite it. + shape.set_data_type(TensorProto_DataType_UINT8); + } else { + shape.set_data_type(TensorProto_DataType_FLOAT); + } } bool valid = true; - for (int i = 1; i < kv.size(); i++) { + for (int i = 1; i < size; i++) { auto dim = kv[i]; try { shape.add_dims(std::stoi(dim)); From ebde5a80bb0bdb30acb83124d7b326644ae76508 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Tue, 22 Sep 2020 18:15:56 -0700 Subject: [PATCH 033/449] [tensorexpr] Add flag to fuse with unknown shapes (#44401) Summary: This flag simply allows users to get fusion groups that will *eventually* have shapes (such that `getOperation` is a valid). This is useful for doing early analysis and compiling just in time. 
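For reference, a minimal usage sketch of the new flag (the include path and the call signature mirror the header and test changes below; the wrapper function name and the idea of calling it from a standalone pass are illustrative assumptions, not part of this change):

```cpp
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>

// Sketch: build fusion groups even though tensor shapes are still unknown.
// The caller then becomes responsible for annotating shapes on the resulting
// prim::TensorExprGroup subgraphs before getOperation() is invoked on them.
void fuseWithoutShapeChecks(std::shared_ptr<torch::jit::Graph>& graph) {
  torch::jit::FuseTensorExprs(
      graph,
      /* min_group_size= */ 2,
      /* disable_shape_checks= */ true);
}
```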
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44401 Reviewed By: ZolotukhinM Differential Revision: D23656140 Pulled By: bwasti fbshipit-source-id: 9a26c202752399d1932ad7d69f21c88081ffc1e5 --- test/cpp/tensorexpr/test_te_fuser_pass.cpp | 21 ++++++++++++++++++++- test/cpp/tensorexpr/tests.h | 1 + torch/csrc/jit/passes/tensorexpr_fuser.cpp | 20 +++++++++++++++----- torch/csrc/jit/passes/tensorexpr_fuser.h | 8 +++++++- 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp index 680311685375..826cf7209346 100644 --- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp +++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp @@ -151,7 +151,7 @@ void testFuserPass_UnknownShapes() { %y : Tensor): %a : Tensor = aten::mul(%x, %y) %b : Tensor = aten::mul(%x, %a) - return (%a))IR"; + return (%b))IR"; auto g = std::make_shared(); torch::jit::parseIR(graph_string, g.get()); @@ -311,5 +311,24 @@ void testFuserPass_MergeGroups() { ->run(*g); } +void testFuserPass_UnknownShapesIgnored() { + WithCPUFuser cf; + KernelScope kernel_scope; + const auto graph_string = R"IR( + graph(%x : Float(device=cpu), + %y : Float(device=cpu)): + %a : Float(device=cpu) = aten::mul(%x, %y) + %b : Float(device=cpu) = aten::mul(%x, %a) + return (%b))IR"; + auto g = std::make_shared(); + torch::jit::parseIR(graph_string, g.get()); + + g->lint(); + FuseTensorExprs(g, /* min_group_size= */ 2, /* disable_shape_checks= */ true); + + // Test that we are generating fusion groups even though shapes are not known + testing::FileCheck().check("prim::TensorExprGroup")->run(*g); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index 20206a348d25..c38a368af13c 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -291,6 +291,7 @@ namespace jit { _(FuserPass_0DimInput) \ _(FuserPass_UnfusibleDevice) \ _(FuserPass_UnknownShapes) \ + _(FuserPass_UnknownShapesIgnored) \ _(FuserPass_Multidevice) \ _(FuserPass_MergeGroups) \ _(TrainBasic) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 02b4861eabfe..4d98110d3975 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -284,8 +284,13 @@ void RemoveTensorTypeSpecializations(std::shared_ptr& graph) { class TensorExprFuser { public: - TensorExprFuser(std::shared_ptr graph, size_t min_group_size) - : graph_(std::move(graph)), min_group_size_(min_group_size) {} + TensorExprFuser( + std::shared_ptr graph, + size_t min_group_size, + bool disable_shape_checks) + : graph_(std::move(graph)), + min_group_size_(min_group_size), + disable_shape_checks_(disable_shape_checks) {} void run() { aliasDb_ = torch::make_unique(graph_); @@ -606,7 +611,7 @@ class TensorExprFuser { bool canHandle(Node* node) { REQ(node->kind() != prim::Constant); - REQ(allShapesAreKnown(node)); + REQ(disable_shape_checks_ || allShapesAreKnown(node)); REQ(isFusableOnDevice(node)); // Don't include nodes whose inputs are tensor constants - we cannot handle @@ -836,9 +841,14 @@ class TensorExprFuser { // Minimal size of a fusion group size_t min_group_size_; + // If true, shapes are ignored + bool disable_shape_checks_; }; -void FuseTensorExprs(std::shared_ptr& graph, size_t min_group_size) { +void FuseTensorExprs( + std::shared_ptr& graph, + size_t min_group_size, + bool disable_shape_checks) { GRAPH_DUMP("Before TExprFuser: ", graph); // 
Temporary change for Block code generation. @@ -849,7 +859,7 @@ void FuseTensorExprs(std::shared_ptr& graph, size_t min_group_size) { // Get rid of dead code so that we don't waste effort fusing it. EliminateDeadCode(graph); - TensorExprFuser fuser(graph, min_group_size); + TensorExprFuser fuser(graph, min_group_size, disable_shape_checks); fuser.run(); EliminateCommonSubexpression(graph); diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.h b/torch/csrc/jit/passes/tensorexpr_fuser.h index db2ee0482960..a99cc88ef439 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.h +++ b/torch/csrc/jit/passes/tensorexpr_fuser.h @@ -10,9 +10,15 @@ namespace jit { struct Graph; // Run TensorExpressions-based fuser. +// +// If shape checks are disabled it is the responsibilty of +// the caller to ensure that the resultant subgraph is correctly +// annotated with shapes by the time "getOperation" is called +// on the node. TORCH_API void FuseTensorExprs( std::shared_ptr& graph, - size_t min_group_size = 2); + size_t min_group_size = 2, + bool disable_shape_checks = false); TORCH_API void setTensorExprFuserEnabled(bool val); TORCH_API bool tensorExprFuserEnabled(); From e045119956aa8ed07e293714fd674bcff6251d69 Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Tue, 22 Sep 2020 18:35:55 -0700 Subject: [PATCH 034/449] [JIT] Add default arguments for class types (#45098) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45098 **Summary** This commit adds support for default arguments in methods of class types. Similar to how default arguments are supported for regular script functions and methods on scripted modules, default values are retrieved from the definition of a TorchScript class in Python as Python objects, converted to IValues, and then attached to the schemas of already compiled class methods. **Test Plan** This commit adds a set of new tests to TestClassType to test default arguments. **Fixes** This commit fixes #42562. Test Plan: Imported from OSS Reviewed By: gmagogsfm Differential Revision: D23844769 Pulled By: SplitInfinity fbshipit-source-id: ceedff7703bf9ede8bd07b3abcb44a0f654936bd --- test/jit/test_class_type.py | 102 +++++++++++++++++++++++++- torch/_C/__init__.pyi.in | 1 + torch/csrc/jit/python/script_init.cpp | 26 +++++++ torch/jit/_script.py | 3 +- torch/jit/frontend.py | 26 +++++++ 5 files changed, 156 insertions(+), 2 deletions(-) diff --git a/test/jit/test_class_type.py b/test/jit/test_class_type.py index c71be6ac1d9f..3fcd89347091 100644 --- a/test/jit/test_class_type.py +++ b/test/jit/test_class_type.py @@ -13,7 +13,7 @@ from torch.testing._internal.jit_utils import JitTestCase import torch.testing._internal.jit_utils from torch.testing._internal.common_utils import IS_SANDCASTLE -from typing import List, Tuple, Iterable +from typing import List, Tuple, Iterable, Optional, Dict if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" @@ -1020,6 +1020,106 @@ def foo(): y.my_list = new_list return y + def test_default_args(self): + """ + Test that methods on class types can have default arguments. 
+ """ + @torch.jit.script + class ClassWithDefaultArgs: + def __init__( + self, + a: int = 1, + b: Optional[List[int]] = None, + c: Tuple[int, int, int] = (1, 2, 3), + d: Optional[Dict[int, int]] = None, + e: Optional[str] = None, + ): + self.int = a + self.tup = c + self.str = e + + self.list = [1, 2, 3] + if b is not None: + self.list = b + + self.dict = {1: 2, 3: 4} + if d is not None: + self.dict = d + + def add(self, b: int, scale: float = 1.0) -> float: + return self.int * scale + b + + def all_defaults() -> int: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs() + return obj.int + obj.list[2] + obj.tup[1] + + def some_defaults() -> int: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs(b=[5, 6, 7]) + return obj.int + obj.list[2] + obj.dict[1] + + def override_defaults() -> int: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs(3, [9, 10, 11], (12, 13, 14), {3: 4}, "str") + s: int = obj.int + + for x in obj.list: + s += x + + for y in obj.tup: + s += y + + s += obj.dict[3] + + st = obj.str + if st is not None: + s += len(st) + + return s + + def method_defaults() -> float: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs() + return obj.add(3) + obj.add(3, 0.25) + + self.checkScript(all_defaults, ()) + self.checkScript(some_defaults, ()) + self.checkScript(override_defaults, ()) + self.checkScript(method_defaults, ()) + + # The constructor of this class below has some arguments without default values. + class ClassWithSomeDefaultArgs: # noqa: B903 + def __init__( + self, + a: int, + b: int = 1, + ): + self.a = a + self.b = b + + def default_b() -> int: + obj: ClassWithSomeDefaultArgs = ClassWithSomeDefaultArgs(1) + return obj.a + obj.b + + def set_b() -> int: + obj: ClassWithSomeDefaultArgs = ClassWithSomeDefaultArgs(1, 4) + return obj.a + obj.b + + self.checkScript(default_b, ()) + self.checkScript(set_b, ()) + + # The constructor of this class below has mutable arguments. This should throw + # an error. + class ClassWithMutableArgs: # noqa: B903 + def __init__( + self, + a: List[int] = [1, 2, 3], # noqa: B006 + ): + self.a = a + + def should_fail(): + obj: ClassWithMutableArgs = ClassWithMutableArgs() + + with self.assertRaisesRegex(RuntimeError, "Mutable default parameters are not supported"): + torch.jit.script(should_fail) + def test_staticmethod(self): """ Test static methods on class types. diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 0d48ea710fdd..41e0e887f829 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -224,6 +224,7 @@ def _jit_script_compile( def _jit_script_class_compile( qual_name: str, definition: ClassDef, + defaults: Dict[str, Dict[str, Any]], rcb: ResolutionCallback, ): ... def _parse_source_def(src: str) -> Def: ... 
diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index 5ed9ba9dc0a7..95d041fe315b 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -53,6 +53,7 @@ using ::c10::FunctionSchema; using ResolutionCallback = std::function; using FunctionDefaults = std::unordered_map; +using ClassMethodDefaults = std::unordered_map; namespace { @@ -1301,6 +1302,7 @@ void initJitScriptBindings(PyObject* module) { "_jit_script_class_compile", [](const std::string& qualifiedName, const ClassDef& classDef, + const ClassMethodDefaults& defaults, ResolutionCallback rcb) { C10_LOG_API_USAGE_ONCE("torch.script.class"); if (classDef.superclass().present()) { @@ -1339,6 +1341,30 @@ void initJitScriptBindings(PyObject* module) { const auto self = SimpleSelf(classType); cu->define(classname, props, propRcbs, methodDefs, methodRcbs, &self); + + // Stitch in default arguments for methods. Properties don't need to be + // considered since there is no way to invoke setters without passing in + // a value. + auto defs_it = methodDefs.begin(); + while (defs_it != methodDefs.end()) { + auto def_name = (*defs_it).name().name(); + // If the method is not in the defaults map, assume there are + // no default arguments for it. + auto default_it = defaults.find(def_name); + if (default_it == defaults.end()) { + continue; + } + + const auto method_name = + QualifiedName(classname, (*defs_it).name().name()); + auto& method = cu->get_function(method_name); + method.setSchema(getSchemaWithNameAndDefaults( + defs_it->range(), + method.getSchema(), + at::nullopt, + default_it->second)); + ++defs_it; + } }); m.def( "_jit_script_interface_compile", diff --git a/torch/jit/_script.py b/torch/jit/_script.py index f5969dbaf030..fb0465288e3f 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -60,7 +60,8 @@ def _is_new_style_class(cls): def _compile_and_register_class(obj, rcb, qualified_name): ast = get_jit_class_def(obj, obj.__name__) - torch._C._jit_script_class_compile(qualified_name, ast, rcb) + defaults = torch.jit.frontend.get_default_args_for_class(obj) + torch._C._jit_script_class_compile(qualified_name, ast, defaults, rcb) torch.jit._state._add_script_class(obj, qualified_name) diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index 2a6dfb498986..4cfba50d0466 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -305,6 +305,32 @@ def get_default_args(fn): } +def get_default_args_for_class(cls): + """ + Get default arguments for all methods in a class (except for static methods). + + Args: + cls: type - The class type to inspect for default arguments. + Returns: + A Dict[str, Dict[str, Any]] which maps each method name to a Dict[str, Any] + that maps each argument name to its default value. + """ + # Get methods (except static methods because those are compiled separately as + # if they were independent script functions). + methods = inspect.getmembers( + cls, + predicate=lambda m: (inspect.ismethod(m) or inspect.isfunction(m)) + and not is_static_fn(cls, m.__name__) + and m.__name__ in cls.__dict__ + ) + + # Get method defaults. Property defaults do not need to be considered + # because setters cannot be invoked without a value. 
+ defaults = {method_name: get_default_args(method_impl) for method_name, method_impl in methods} + + return defaults + + class WithItemBuilder(Builder): @staticmethod def build_withitem(ctx, item): From f575df201f290fba6e8db6d8581f57bc9ba9b07f Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 22 Sep 2020 19:35:45 -0700 Subject: [PATCH 035/449] [quant][graphmode][jit][api] Expose preserved_attrs from finalize to convert_jit (#44490) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44490 Test Plan: Imported from OSS Reviewed By: z-a-f Differential Revision: D23631142 fbshipit-source-id: f0913f0cb4576067e2a7288326024942d12e0ae0 --- torch/csrc/jit/passes/quantization/finalize.cpp | 7 +++++-- torch/csrc/jit/passes/quantization/finalize.h | 4 +++- torch/csrc/jit/python/init.cpp | 9 ++++++--- torch/quantization/quantize_jit.py | 15 +++++++++------ 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/torch/csrc/jit/passes/quantization/finalize.cpp b/torch/csrc/jit/passes/quantization/finalize.cpp index 3d0d9a6eff6c..635c02728f6b 100644 --- a/torch/csrc/jit/passes/quantization/finalize.cpp +++ b/torch/csrc/jit/passes/quantization/finalize.cpp @@ -73,12 +73,15 @@ void FoldQuantizedPrepackingOps(Module& module) { PrePackingOpsFolder(module, filter_fn, "quantized"); } -Module Finalize(Module& module, QuantType quant_type) { +Module Finalize( + Module& module, + QuantType quant_type, + const std::vector& preserved_attrs) { auto graph = module.get_method("forward").graph(); InsertPrepackUnpack(graph); GRAPH_DUMP("Before QuantFusion:", graph); QuantFusion(graph, quant_type); - auto frozen = freeze_module(module); + auto frozen = freeze_module(module, preserved_attrs); FoldQuantizedPrepackingOps(frozen); return frozen; } diff --git a/torch/csrc/jit/passes/quantization/finalize.h b/torch/csrc/jit/passes/quantization/finalize.h index 1de65dcb20e4..062d1e24251e 100644 --- a/torch/csrc/jit/passes/quantization/finalize.h +++ b/torch/csrc/jit/passes/quantization/finalize.h @@ -49,7 +49,9 @@ TORCH_API void InsertPrepackUnpack(Module& module); TORCH_API script::Module Finalize( script::Module& module, - QuantType quant_type = QuantType::STATIC); + QuantType quant_type = QuantType::STATIC, + const std::vector& preserved_attrs = + std::vector()); TORCH_API void FoldQuantizedPrepackingOps(Module& module); diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 052b22a78917..db866704aa97 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -284,12 +284,15 @@ void initJITBindings(PyObject* module) { [](Module& module) { SwapFunctionalLinear(module); }) .def( "_jit_pass_quant_finalize", - [](Module& module, int quant_type_int) { + [](Module& module, + int quant_type_int, + const std::vector& preserved_attrs) { auto quant_type = static_cast(quant_type_int); - return Finalize(module, quant_type); + return Finalize(module, quant_type, preserved_attrs); }, py::arg("module"), - py::arg("quant_type_int") = 1) + py::arg("quant_type_int") = 1, + py::arg("preserved_attrs") = std::vector()) .def( "_jit_pass_pattern_based_rewrite", [](const Module& m) { return PatternBasedRewrite(m); }) diff --git a/torch/quantization/quantize_jit.py b/torch/quantization/quantize_jit.py index 130f0297357c..ef6792d521f6 100644 --- a/torch/quantization/quantize_jit.py +++ b/torch/quantization/quantize_jit.py @@ -67,7 +67,8 @@ def prepare_jit(model, qconfig_dict, inplace=False): def prepare_dynamic_jit(model, qconfig_dict, inplace=False): return 
_prepare_jit(model, qconfig_dict, inplace, quant_type=QuantType.DYNAMIC) -def _convert_jit(model, inplace=False, debug=False, quant_type=QuantType.STATIC): +def _convert_jit(model, inplace=False, debug=False, quant_type=QuantType.STATIC, + preserved_attrs=None): _check_is_script_module(model) model.eval() model_c = model._c @@ -76,18 +77,20 @@ def _convert_jit(model, inplace=False, debug=False, quant_type=QuantType.STATIC) # Moving model parameters to CPU since quantized operators # are only supported on CPU right now model.cpu() - model_c = torch._C._jit_pass_quant_finalize(model_c, quant_type) + if preserved_attrs is None: + preserved_attrs = [] + model_c = torch._C._jit_pass_quant_finalize(model_c, quant_type, preserved_attrs) if inplace: model._reconstruct(model_c) else: model = wrap_cpp_module(model_c) return model -def convert_jit(model, inplace=False, debug=False): - return _convert_jit(model, inplace, debug, quant_type=QuantType.STATIC) +def convert_jit(model, inplace=False, debug=False, preserved_attrs=None): + return _convert_jit(model, inplace, debug, quant_type=QuantType.STATIC, preserved_attrs=preserved_attrs) -def convert_dynamic_jit(model, inplace=False, debug=False): - return _convert_jit(model, inplace, debug, quant_type=QuantType.DYNAMIC) +def convert_dynamic_jit(model, inplace=False, debug=False, preserved_attrs=None): + return _convert_jit(model, inplace, debug, quant_type=QuantType.DYNAMIC, preserved_attrs=preserved_attrs) def _quantize_jit(model, qconfig_dict, run_fn=None, run_args=None, inplace=False, debug=False, quant_type=QuantType.STATIC): # Always do inplace convert because the Tensor is already From 666223df46fbc271a694293db3b4465271717f34 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Tue, 22 Sep 2020 19:42:28 -0700 Subject: [PATCH 036/449] [jit] gtestify test_argument_spec.cpp (#45019) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45019 See https://github.com/pytorch/pytorch/pull/45018 for context. Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D23802298 Pulled By: suo fbshipit-source-id: 0e36d095d4d81dcd5ebe6d56b3dc469d6d5482d0 --- test/cpp/jit/test_argument_spec.cpp | 52 ++++++++++++++++------------- test/cpp/jit/test_base.h | 5 +++ test/cpp/jit/tests.h | 4 --- tools/build_variables.bzl | 44 ------------------------ 4 files changed, 34 insertions(+), 71 deletions(-) diff --git a/test/cpp/jit/test_argument_spec.cpp b/test/cpp/jit/test_argument_spec.cpp index 01e27caac05f..bf40761fc468 100644 --- a/test/cpp/jit/test_argument_spec.cpp +++ b/test/cpp/jit/test_argument_spec.cpp @@ -1,3 +1,5 @@ +#include + #include #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/runtime/argument_spec.h" @@ -5,6 +7,8 @@ namespace torch { namespace jit { +namespace { + int device(const autograd::Variable& v) { return v.device().is_cuda() ? v.get_device() : -1; } @@ -38,8 +42,9 @@ autograd::Variable var( autograd::Variable undef() { return autograd::Variable(); } +} // namespace -void testCompleteArgumentSpec() { +TEST(ArgumentSpecTest, CompleteArgumentSpec_CUDA) { auto const CF = at::CPU(at::kFloat); auto const CD = at::CPU(at::kDouble); auto const GF = at::CUDA(at::kFloat); @@ -94,34 +99,35 @@ void testCompleteArgumentSpec() { ASSERT_EQ(with_const.at(2).sizes().size(), 2); } -size_t hashCode(const TensorTypePtr& ptr) { - return std::hash()(*ptr.get()); -} +// TODO: this test was disabled for unknown reasons and doesn't run. 
+// static size_t hashCode(const TensorTypePtr& ptr) { +// return std::hash()(*ptr.get()); +// } -void testProfiledTensorTypeHashing() { - c10::VaryingShape vs(c10::optional{}); - auto ptt_empty1 = TensorType::create({}, {}, vs, vs, false); - auto ptt_empty2 = TensorType::create({}, {}, vs, vs, false); - ASSERT_EQ(hashCode(ptt_empty1), hashCode(ptt_empty2)); +// TEST(ArgumentSpecTest, VaryingShape) { +// c10::VaryingShape vs(c10::optional{}); +// auto ptt_empty1 = TensorType::create({}, {}, vs, vs, false); +// auto ptt_empty2 = TensorType::create({}, {}, vs, vs, false); +// ASSERT_EQ(hashCode(ptt_empty1), hashCode(ptt_empty2)); - c10::VaryingShape vs22(std::vector{2, 2}); - auto ptt_vs22_vs22_1 = TensorType::create({}, {}, vs22, vs22, false); - auto ptt_vs22_vs22_2 = TensorType::create({}, {}, vs22, vs22, false); - ASSERT_EQ(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs22_2)); +// c10::VaryingShape vs22(std::vector{2, 2}); +// auto ptt_vs22_vs22_1 = TensorType::create({}, {}, vs22, vs22, false); +// auto ptt_vs22_vs22_2 = TensorType::create({}, {}, vs22, vs22, false); +// ASSERT_EQ(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs22_2)); - c10::VaryingShape vs23(std::vector{2, 3}); - auto ptt_vs22_vs23_2 = TensorType::create({}, {}, vs22, vs23, false); - ASSERT_NE(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs23_2)); +// c10::VaryingShape vs23(std::vector{2, 3}); +// auto ptt_vs22_vs23_2 = TensorType::create({}, {}, vs22, vs23, false); +// ASSERT_NE(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs23_2)); - auto ptt_vs22_vs22_1_true = TensorType::create({}, {}, vs22, vs22, true); - auto ptt_vs22_vs22_2_true = TensorType::create({}, {}, vs22, vs22, true); - ASSERT_EQ(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_2_true)); +// auto ptt_vs22_vs22_1_true = TensorType::create({}, {}, vs22, vs22, true); +// auto ptt_vs22_vs22_2_true = TensorType::create({}, {}, vs22, vs22, true); +// ASSERT_EQ(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_2_true)); - auto ptt_vs22_vs22_1_false = TensorType::create({}, {}, vs22, vs22, false); - ASSERT_NE(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_1_false)); -} +// auto ptt_vs22_vs22_1_false = TensorType::create({}, {}, vs22, vs22, false); +// ASSERT_NE(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_1_false)); +// } -void testArgumentSpec() { +TEST(ArgumentSpecTest, Basic_CUDA) { auto& CF = at::CPU(at::kFloat); auto& CD = at::CPU(at::kDouble); auto& GF = at::CUDA(at::kFloat); diff --git a/test/cpp/jit/test_base.h b/test/cpp/jit/test_base.h index 54a59e445e95..25f9e9f36cde 100644 --- a/test/cpp/jit/test_base.h +++ b/test/cpp/jit/test_base.h @@ -10,6 +10,10 @@ #include #else #include "c10/util/Exception.h" +// Temporary: we are going to remove these polyfills entirely. +// But for now avoid redefining them if they are already defined in gtest. 
+// (ASSERT_EQ is a proxy for whether gtest is already present) +#ifndef ASSERT_EQ #define ASSERT_EQ(x, y) TORCH_INTERNAL_ASSERT((x) == (y)) #define ASSERT_NE(x, y) TORCH_INTERNAL_ASSERT((x) != (y)) #define ASSERT_TRUE TORCH_INTERNAL_ASSERT @@ -31,6 +35,7 @@ } \ ASSERT_TRUE(threw); \ } +#endif // ndef(ASSERT_EQ) #endif // defined(USE_GTEST) diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index df45054edc43..452156fc052b 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -113,8 +113,6 @@ namespace jit { #if defined(USE_CUDA) #define TH_FORALL_TESTS_CUDA(_) \ - _(ArgumentSpec) \ - _(CompleteArgumentSpec) \ _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ @@ -220,8 +218,6 @@ namespace jit { _(GPU_FusionThreadPredicate) #else #define TH_FORALL_TESTS_CUDA(_) \ - _(ArgumentSpec) \ - _(CompleteArgumentSpec) \ _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 3cc3585aa555..3f5126358804 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -575,48 +575,4 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"): _libtorch_python_sources.extend(libtorch_python_core_sources) _libtorch_python_sources.extend(libtorch_python_distributed_sources) - _libtorch_python_sources.extend([ - "test/cpp/jit/torch_python_test.cpp", - "test/cpp/tensorexpr/padded_buffer.cpp", - "test/cpp/jit/test_alias_analysis.cpp", - "test/cpp/jit/test_argument_spec.cpp", - "test/cpp/jit/test_autodiff.cpp", - "test/cpp/jit/test_backend.cpp", - "test/cpp/jit/test_base.cpp", - "test/cpp/jit/test_class_import.cpp", - "test/cpp/jit/test_class_parser.cpp", - "test/cpp/jit/test_class_type.cpp", - "test/cpp/jit/test_code_template.cpp", - "test/cpp/jit/test_constant_pooling.cpp", - "test/cpp/jit/test_cleanup_passes.cpp", - "test/cpp/jit/test_create_autodiff_subgraphs.cpp", - "test/cpp/jit/test_custom_class.cpp", - "test/cpp/jit/test_custom_operators.cpp", - "test/cpp/jit/test_dce.cpp", - "test/cpp/jit/test_fuser.cpp", - "test/cpp/jit/test_gpu.cpp", - "test/cpp/jit/test_graph_executor.cpp", - "test/cpp/jit/test_inliner.cpp", - "test/cpp/jit/test_interface.cpp", - "test/cpp/jit/test_interpreter.cpp", - "test/cpp/jit/test_ir.cpp", - "test/cpp/jit/test_irparser.cpp", - "test/cpp/jit/test_jit_type.cpp", - "test/cpp/jit/test_lite_interpreter.cpp", - "test/cpp/jit/test_lite_trainer.cpp", - "test/cpp/jit/test_misc.cpp", - "test/cpp/jit/test_mobile_type_parser.cpp", - "test/cpp/jit/test_module_api.cpp", - "test/cpp/jit/test_peephole_optimize.cpp", - "test/cpp/jit/test_qualified_name.cpp", - "test/cpp/jit/test_save_load.cpp", - "test/cpp/jit/test_schema_matching.cpp", - "test/cpp/jit/test_subgraph_matcher.cpp", - "test/cpp/jit/test_subgraph_rewriter.cpp", - "test/cpp/jit/test_subgraph_utils.cpp", - "test/cpp/jit/test_utils.cpp", - ]) - - _libtorch_python_sources.extend(native.glob(["test/cpp/tensorexpr/test_*.cpp"])) - return _libtorch_python_sources From 67a19fecef1605267ea7581e67eb6a1f74b4842c Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 22 Sep 2020 20:17:23 -0700 Subject: [PATCH 037/449] CUDA BFloat16 pooling (#45151) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45151 Reviewed By: ailzhang Differential Revision: D23854056 Pulled By: ngimel fbshipit-source-id: 32f0835218c2602a09654a9ac2d161c4eb360f90 --- aten/src/ATen/native/cuda/DilatedMaxPool2d.cu | 284 +++++++++--------- aten/src/ATen/native/cuda/DilatedMaxPool3d.cu | 50 ++- test/test_nn.py | 17 +- 3 files changed, 172 
insertions(+), 179 deletions(-) diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index c629dfc4030c..3e0e70c01952 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -366,70 +366,68 @@ void max_pool2d_with_indices_out_cuda_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool2d_with_indices_out_cuda_frame", [&] { - using accscalar_t = acc_type; - - scalar_t *output_data = output.data_ptr(); - scalar_t *input_data = input.data_ptr(); - int64_t *indices_data = indices.data_ptr(); - - switch (memory_format) { - case MemoryFormat::ChannelsLast: { - const int max_threads = std::min( - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); - int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; - int block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); - int block_y = std::min( - maxThreadsDim[1], std::min(lastPow2(outputWidth), max_threads / block_x)); - int block_z = std::min( - maxThreadsDim[2], std::min(lastPow2(outputHeight), max_threads / block_x / block_y)); - block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); - const dim3 block(block_x, block_y, block_z); - - int kernel_stride_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * 4); - int kernel_size_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * kernel_stride_C); - - int grid_x = nbatch*kernel_stride_C; - int grid_y = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[1], - cuda::ATenCeilDiv(safe_downcast(outputWidth), block_y*BLOCK_STRIDE)); - int grid_z = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[2], - cuda::ATenCeilDiv(safe_downcast(outputHeight), block_z*BLOCK_STRIDE)); - const dim3 grid(grid_x, grid_y, grid_z); - - size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t)); - AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); - - max_pool_forward_nhwc - <<>>( - input_data, nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - in_stride_n, in_stride_c, - in_stride_h, in_stride_w, - kernel_stride_C, kernel_size_C, - output_data, indices_data); - break; - } - case MemoryFormat::Contiguous: { - const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, - BLOCK_THREADS); - max_pool_forward_nchw - <<>>( - count, input_data, - nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - output_data, indices_data); - break; - } - default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + using accscalar_t = acc_type; + + scalar_t *output_data = output.data_ptr(); + scalar_t *input_data = input.data_ptr(); + int64_t *indices_data = indices.data_ptr(); + + switch (memory_format) { + case MemoryFormat::ChannelsLast: { + const int max_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); + int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; + int block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); + int block_y = std::min( + maxThreadsDim[1], std::min(lastPow2(outputWidth), max_threads / block_x)); + int block_z = std::min( + maxThreadsDim[2], std::min(lastPow2(outputHeight), max_threads / block_x / block_y)); + block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); + const dim3 block(block_x, block_y, block_z); + + int kernel_stride_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * 4); + int kernel_size_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * kernel_stride_C); + + int grid_x = nbatch*kernel_stride_C; + int grid_y = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[1], + cuda::ATenCeilDiv(safe_downcast(outputWidth), block_y*BLOCK_STRIDE)); + int grid_z = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[2], + cuda::ATenCeilDiv(safe_downcast(outputHeight), block_z*BLOCK_STRIDE)); + const dim3 grid(grid_x, grid_y, grid_z); + + size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t)); + AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); + + max_pool_forward_nhwc + <<>>( + input_data, nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + in_stride_n, in_stride_c, + in_stride_h, in_stride_w, + kernel_stride_C, kernel_size_C, + output_data, indices_data); + break; } - }); + case MemoryFormat::Contiguous: { + const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, + BLOCK_THREADS); + max_pool_forward_nchw + <<>>( + count, input_data, + nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + output_data, indices_data); + break; + } + default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } } ); @@ -532,88 +530,86 @@ void max_pool2d_with_indices_backward_out_cuda_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool2d_with_indices_out_cuda_frame", [&] { - using accscalar_t = acc_type; - - scalar_t *gradOutput_data = gradOutput.data_ptr(); - scalar_t *gradInput_data = gradInput.data_ptr(); - int64_t *indices_data = indices.data_ptr(); - - switch (memory_format) { - case MemoryFormat::ChannelsLast: { - const int max_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); - int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; - int block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); - int block_y = std::min( - maxThreadsDim[1], std::min(lastPow2(inputWidth), max_threads / block_x)); - int block_z = std::min( - maxThreadsDim[2], std::min(lastPow2(inputHeight), max_threads / block_x / block_y)); - block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); - const dim3 block(block_x, block_y, block_z); - - int kernel_stride_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * 4); - int kernel_size_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * kernel_stride_C); - - int grid_x = nbatch*kernel_stride_C; - int grid_y = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[1], - cuda::ATenCeilDiv(safe_downcast(inputWidth), block_y*BLOCK_STRIDE)); - int grid_z = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[2], - cuda::ATenCeilDiv(safe_downcast(inputHeight), block_z*BLOCK_STRIDE)); - const dim3 grid(grid_x, grid_y, grid_z); - - size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * sizeof(accscalar_t); - AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); - - // The backward kernel is launched on input instead output. - // If it is launched on output layer, atomic_add would not provide much benefit on FP16. - // Please check comments at https://github.com/pytorch/pytorch/pull/34519. - max_pool_backward_nhwc - <<>>( - count, - gradOutput_data, - indices_data, - nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - out_stride_c, out_stride_h, out_stride_w, - in_stride_n, in_stride_c, - in_stride_h, in_stride_w, - kernel_stride_C, kernel_size_C, - gradInput_data); - break; - } - case MemoryFormat::Contiguous: { - int imgcount = inputWidth * inputHeight; - dim3 grid; - const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS; - grid.x = blocks; - grid.y = nbatch; - uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - if (maxGridY < grid.y) grid.y = maxGridY; - grid.z = nInputPlane; - uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; - if (maxGridZ < grid.z) grid.z = maxGridZ; - - max_pool_backward_nchw - <<>>( - count, - gradOutput_data, - indices_data, - nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - gradInput_data); - break; - } - default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + using accscalar_t = acc_type; + + scalar_t *gradOutput_data = gradOutput.data_ptr(); + scalar_t *gradInput_data = gradInput.data_ptr(); + int64_t *indices_data = indices.data_ptr(); + + switch (memory_format) { + case MemoryFormat::ChannelsLast: { + const int max_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); + int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; + int block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); + int block_y = std::min( + maxThreadsDim[1], std::min(lastPow2(inputWidth), max_threads / block_x)); + int block_z = std::min( + maxThreadsDim[2], std::min(lastPow2(inputHeight), max_threads / block_x / block_y)); + block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); + const dim3 block(block_x, block_y, block_z); + + int kernel_stride_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * 4); + int kernel_size_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * kernel_stride_C); + + int grid_x = nbatch*kernel_stride_C; + int grid_y = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[1], + cuda::ATenCeilDiv(safe_downcast(inputWidth), block_y*BLOCK_STRIDE)); + int grid_z = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[2], + cuda::ATenCeilDiv(safe_downcast(inputHeight), block_z*BLOCK_STRIDE)); + const dim3 grid(grid_x, grid_y, grid_z); + + size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * sizeof(accscalar_t); + AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); + + // The backward kernel is launched on input instead output. + // If it is launched on output layer, atomic_add would not provide much benefit on FP16. + // Please check comments at https://github.com/pytorch/pytorch/pull/34519. + max_pool_backward_nhwc + <<>>( + count, + gradOutput_data, + indices_data, + nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + out_stride_c, out_stride_h, out_stride_w, + in_stride_n, in_stride_c, + in_stride_h, in_stride_w, + kernel_stride_C, kernel_size_C, + gradInput_data); + break; } - }); + case MemoryFormat::Contiguous: { + int imgcount = inputWidth * inputHeight; + dim3 grid; + const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS; + grid.x = blocks; + grid.y = nbatch; + uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + if (maxGridY < grid.y) grid.y = maxGridY; + grid.z = nInputPlane; + uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; + if (maxGridZ < grid.z) grid.z = maxGridZ; + + max_pool_backward_nchw + <<>>( + count, + gradOutput_data, + indices_data, + nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + gradInput_data); + break; + } + default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } } ); diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu index 2b0ba37c8880..9d72e0027007 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu @@ -276,20 +276,18 @@ void max_pool3d_with_indices_out_cuda_template( input.scalar_type(), "max_pool3d_with_indices_out_frame", [&]{ - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool3d_with_indices_out_frame", [&] { - scalar_t *input_data = work_input.data_ptr(); - int64_t totalZ = otime * nslices * nbatch; - - max_pool3d_with_indices_out_frame( - input_data, work_output, work_indices, - totalZ, - itime, iheight, iwidth, - otime, oheight, owidth, - kT, kH, kW, - dT, dH, dW, - pT, pH, pW, - dilationT, dilationH, dilationW); - }); + scalar_t *input_data = work_input.data_ptr(); + int64_t totalZ = otime * nslices * nbatch; + + max_pool3d_with_indices_out_frame( + input_data, work_output, work_indices, + totalZ, + itime, iheight, iwidth, + otime, oheight, owidth, + kT, kH, kW, + dT, dH, dW, + pT, pH, pW, + dilationT, dilationH, dilationW); } ); } @@ -387,19 +385,17 @@ void max_pool3d_with_indices_backward_out_cuda_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool3d_with_indices_backward_out_frame", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool3d_with_indices_backward_out_frame", [&] { - const int64_t totalZ = otime * nslices * nbatch; - scalar_t *grad_input_data = work_grad_input.data_ptr(); - - max_pool3d_with_indices_backward_out_frame( - grad_input_data, work_grad_output, work_indices, - totalZ, - itime, iheight, iwidth, - oheight, owidth, - dT, dH, dW, - pT, pH, pW, - dilationT, dilationH, dilationW); - }); + const int64_t totalZ = otime * nslices * nbatch; + scalar_t *grad_input_data = work_grad_input.data_ptr(); + + max_pool3d_with_indices_backward_out_frame( + grad_input_data, work_grad_output, work_indices, + totalZ, + itime, iheight, iwidth, + oheight, owidth, + dT, dH, dW, + pT, pH, pW, + dilationT, dilationH, dilationW); } ); } diff --git a/test/test_nn.py b/test/test_nn.py index 00614c0cdc34..281425e26782 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -33,6 +33,7 @@ from torch.autograd.gradcheck import gradgradcheck from torch.nn import Parameter from torch.nn.parallel._functions import Broadcast +from torch.testing import get_all_fp_dtypes from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \ TEST_NUMPY, TEST_SCIPY, TEST_WITH_ROCM, download_file, \ get_function_arglist, load_tests, repeat_test_for_types, ALL_TENSORTYPES, \ @@ -11552,32 +11553,32 @@ def expected_output(dim): self.assertEqual(output[0, 0, 0, 0], float("-inf")) self.assertEqual(indices[0, 0, 0, 0], 0) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_MaxPool1d_indices(self, device, dtype): self._test_maxpool_indices(1, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_MaxPool2d_indices(self, device, dtype): self._test_maxpool_indices(2, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_MaxPool3d_indices(self, device, dtype): self._test_maxpool_indices(3, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def 
test_AdaptiveMaxPool1d_indices(self, device, dtype): self._test_maxpool_indices(1, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_AdaptiveMaxPool2d_indices(self, device, dtype): self._test_maxpool_indices(2, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_AdaptiveMaxPool3d_indices(self, device, dtype): self._test_maxpool_indices(3, adaptive=True, device=device, dtype=dtype) @@ -11650,7 +11651,7 @@ def test_pooling_zero_stride(self, device): self.assertRaisesRegex(RuntimeError, r"stride should not be zero|stride must be greater than zero", lambda: fn_module(x)) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_pool_large_size(self, device, dtype): for op in ('max', 'avg'): @@ -11664,7 +11665,7 @@ def test_pool_large_size(self, device, dtype): # check if the output shape was still computed correctly self.assertEqual(x.shape[2], res.shape[2]) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_pool_invalid_size(self, device, dtype): for op in ('max', 'avg'): From 1bd6533d60797949b599843dc5473eda0e3fce65 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 22 Sep 2020 21:13:07 -0700 Subject: [PATCH 038/449] Remove thread_local RecordFunctionGuard from profiler. (#44646) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44646 Per a discussion with ilia-cher, this is not needed anymore and removing it would make some future changes to support async RPC profiling easier. Tested by ensuring profiling tests in `test_autograd.py` still pass. ghstack-source-id: 112605618 Test Plan: CI Reviewed By: mrshenli Differential Revision: D23683998 fbshipit-source-id: 4e49a439509884fe04d922553890ae353e3331ab --- torch/csrc/autograd/profiler.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 049f857f8bbf..9d75eea84328 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -388,9 +388,6 @@ void pushProfilingCallbacks() { const int kCUDAWarmupStart = 5; -// temp. workaround for dispatcher ::Profiler key -thread_local std::vector> g_; - } // namespace void registerCUDAMethods(CUDAStubs* stubs) { @@ -450,7 +447,6 @@ void enableProfiler(const ProfilerConfig& new_config) { c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); pushProfilingCallbacks(); - g_.emplace_back(std::make_shared()); if (new_config.state == ProfilerState::CUDA) { // event recording appears to have some startup overhead, so we need to @@ -479,7 +475,6 @@ thread_event_lists disableProfiler() { TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled, "Can't disable profiler when it's not running"); - g_.pop_back(); at::removeCallback(state_ptr->callbackHandle()); if (state_ptr->config().state == ProfilerState::NVTX) { From 70d2e4d1f6c46db2edc056c0a4c458c57e3e46f6 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 22 Sep 2020 21:13:07 -0700 Subject: [PATCH 039/449] [RPC profiling] Allow disableProfiler() to be called from another thread. 
(#44653) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44653 This changes the profiler per a discussion with ilia-cher offline that enables `disableProfiler()` event consolidation logic to be called from different threads (i.e. threads where the profiler was not explicitly enabled). This is needed to support the functionality enabled by D23638387 where we defer profiling event collection until executing an async callback that can execute on a different thread, to support RPC async function profiling. This is done by introducing 2 flags `cleanupTLSState` and `consolidate` which controls whether we should clean up thread local settings (we don't do this when calling `disableProfiler()` on non-main threads) and whether we should consolidate all profiled events. Backwards compatiblity is ensured since both options are true by default. Added a test in `test_misc.cpp` to test this. ghstack-source-id: 112605620 Reviewed By: mrshenli Differential Revision: D23638499 fbshipit-source-id: f5bbb0d41ef883c5e5870bc27e086b8b8908f46b --- c10/util/ThreadLocalDebugInfo.cpp | 9 +++++++ c10/util/ThreadLocalDebugInfo.h | 3 +++ test/cpp/jit/test_misc.cpp | 41 +++++++++++++++++++++++++++++++ test/cpp/jit/tests.h | 1 + torch/csrc/autograd/init.cpp | 6 ++++- torch/csrc/autograd/profiler.cpp | 19 +++++++++----- torch/csrc/autograd/profiler.h | 2 +- 7 files changed, 73 insertions(+), 8 deletions(-) diff --git a/c10/util/ThreadLocalDebugInfo.cpp b/c10/util/ThreadLocalDebugInfo.cpp index a9cdc26b5934..20d473667a8d 100644 --- a/c10/util/ThreadLocalDebugInfo.cpp +++ b/c10/util/ThreadLocalDebugInfo.cpp @@ -51,6 +51,15 @@ std::shared_ptr ThreadLocalDebugInfo::_pop(DebugInfoKind kind) { return res->info_; } +/* static */ +std::shared_ptr ThreadLocalDebugInfo::_peek(DebugInfoKind kind) { + TORCH_CHECK( + debug_info && debug_info->kind_ == kind, + "Expected debug info of type ", + (size_t)kind); + return debug_info->info_; +} + DebugInfoGuard::DebugInfoGuard( DebugInfoKind kind, std::shared_ptr info) { diff --git a/c10/util/ThreadLocalDebugInfo.h b/c10/util/ThreadLocalDebugInfo.h index 207abed781b0..9620cfb9fdea 100644 --- a/c10/util/ThreadLocalDebugInfo.h +++ b/c10/util/ThreadLocalDebugInfo.h @@ -46,6 +46,9 @@ class C10_API ThreadLocalDebugInfo { // Pop debug info, throws in case the last pushed // debug info is not of a given kind static std::shared_ptr _pop(DebugInfoKind kind); + // Peek debug info, throws in case the last pushed debug info is not of the + // given kind + static std::shared_ptr _peek(DebugInfoKind kind); private: std::shared_ptr info_; diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 4941c11d6cae..953d1bf42fc0 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -2184,5 +2184,46 @@ void testTLSFutureCallbacks() { } } +void testProfilerDisableInCallback() { + // cb that verifies the profiler is enabled + auto profilerEnabledCb = []() { + ASSERT_TRUE(torch::autograd::profiler::profilerEnabled()); + }; + torch::autograd::profiler::enableProfiler( + torch::autograd::profiler::ProfilerConfig( + torch::autograd::profiler::ProfilerState::CPU, false, false)); + auto s1 = c10::make_intrusive(IntType::get()); + s1->addCallback(wrapPropagateTLSState([&profilerEnabledCb] { + // Ensure the profiler is still enabled in this thread. + profilerEnabledCb(); + auto t1 = torch::ones({2, 2}); + auto t2 = torch::ones({2, 2}); + torch::add(t1, t2); + // Don't cleanup TLSState, and just consolidate. 
+ auto thread_event_lists = + torch::autograd::profiler::disableProfiler(false, true); + // Ensure that the events from this thread are still profiled and we obtain + // the expected in events in our consolidated list when calling + // disableProfiler(). + bool found_ones = false; + bool found_add = false; + for (const auto& li : thread_event_lists) { + for (const auto& evt : li) { + if (strcmp(evt.name(), "aten::add") == 0) { + found_add = true; + } else if (strcmp(evt.name(), "aten::ones") == 0) { + found_ones = true; + } + } + } + ASSERT_TRUE(found_ones); + ASSERT_TRUE(found_add); + })); + // Disable the profiler, but do not consolidate results in the main thread. + torch::autograd::profiler::disableProfiler(true, false); + std::thread t([s1 = std::move(s1)]() { s1->markCompleted(at::IValue(1)); }); + t.join(); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 452156fc052b..45d7f48b1f8a 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -84,6 +84,7 @@ namespace jit { _(DefaultArgTypeHinting) \ _(Futures) \ _(TLSFutureCallbacks) \ + _(ProfilerDisableInCallback) \ _(MobileTypeParser) \ _(LiteInterpreterBuiltinFunction) \ _(LiteInterpreterPrim) \ diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index c92654cf7815..69759d1948b2 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -61,7 +61,11 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .def("sequence_nr", &Event::sequence_nr); m.def("_enable_profiler", enableProfiler); - m.def("_disable_profiler", disableProfiler); + m.def( + "_disable_profiler", + disableProfiler, + py::arg("cleanup_tls_states") = true, + py::arg("consolidate") = true); m.def("_profiler_enabled", profilerEnabled); m.def("_enable_record_function", [](bool enable) { at::enableRecordFunction(enable); diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 9d75eea84328..bab21ee5a7a8 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -442,7 +442,6 @@ void enableProfiler(const ProfilerConfig& new_config) { auto state_ptr = getProfilerTLSState(); TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); - auto state = std::make_shared(new_config); c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); @@ -468,21 +467,29 @@ void enableProfiler(const ProfilerConfig& new_config) { state->mark("__start_profile", false); } -thread_event_lists disableProfiler() { +thread_event_lists disableProfiler(bool cleanupTLSState, bool consolidate) { // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard - auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + std::shared_ptr state; + if (cleanupTLSState) { + state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + } else { + state = c10::ThreadLocalDebugInfo::_peek(c10::DebugInfoKind::PROFILER_STATE); + } + auto state_ptr = static_cast(state.get()); TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled, "Can't disable profiler when it's not running"); - at::removeCallback(state_ptr->callbackHandle()); + if (cleanupTLSState) { + at::removeCallback(state_ptr->callbackHandle()); + } - if (state_ptr->config().state == ProfilerState::NVTX) { + if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) { return thread_event_lists(); } state_ptr->mark("__stop_profile"); - + // Note 
that this will erase the underlying events. return state_ptr->consolidate(); } diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 3f962eff341d..6a7c5095a071 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -341,7 +341,7 @@ using thread_event_lists = std::vector>; // NOTE: profiler mode is thread local, with automatic propagation // across thread boundary (e.g. at::launch tasks) TORCH_API void enableProfiler(const ProfilerConfig&); -TORCH_API thread_event_lists disableProfiler(); +TORCH_API thread_event_lists disableProfiler(bool cleanupTLSState = true, bool consolidate = true); // adds profiledEvents to the current thread local recorded events. Each event // will be marked with node ID given by fromNodeId. TORCH_API void addEventList(std::vector&& profiledEvents); From d4a634c2093d1a47ed0390765a7f4d4c6d70e015 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 22 Sep 2020 21:13:07 -0700 Subject: [PATCH 040/449] [RPC profiling] Don't wrap toHere() calls with profiling (#44655) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44655 Since `toHere()` does not execute operations over RPC and simply transfers the value to the local node, we don't need to enable the profiler remotely for this message. This causes unnecessary overhead and is not needed. Since `toHere` is a blocking call, we already profile the call on the local node using `RECORD_USER_SCOPE`, so this does not change the expected profiler results (validated by ensuring all remote profiling tests pass). ghstack-source-id: 112605610 Test Plan: CI Reviewed By: mrshenli Differential Revision: D23641466 fbshipit-source-id: 109d9eb10bd7fe76122b2026aaf1c7893ad10588 --- torch/csrc/distributed/autograd/utils.cpp | 5 +++-- torch/csrc/distributed/autograd/utils.h | 3 ++- torch/csrc/distributed/rpc/rref_impl.cpp | 9 +++++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/torch/csrc/distributed/autograd/utils.cpp b/torch/csrc/distributed/autograd/utils.cpp index 726cc605a913..464d8248d8a4 100644 --- a/torch/csrc/distributed/autograd/utils.cpp +++ b/torch/csrc/distributed/autograd/utils.cpp @@ -143,7 +143,8 @@ std::shared_ptr sendMessageWithAutograd( const WorkerInfo& dst, torch::distributed::rpc::Message&& wrappedRpcMsg, bool forceGradRecording, - const float rpcTimeoutSeconds) { + const float rpcTimeoutSeconds, + bool forceDisableProfiling) { auto msg = getMessageWithAutograd( dst.id_, std::move(wrappedRpcMsg), @@ -153,7 +154,7 @@ std::shared_ptr sendMessageWithAutograd( std::shared_ptr fut; // If profiler is enabled, wrap this message with profiling metadata that will // tell the remote end to process this request with the profiler enabled. 
- if (torch::autograd::profiler::profilerEnabled()) { + if (!forceDisableProfiling && torch::autograd::profiler::profilerEnabled()) { auto profilerConfig = torch::autograd::profiler::getProfilerConfig(); auto msgWithProfiling = getMessageWithProfiling( std::move(msg), diff --git a/torch/csrc/distributed/autograd/utils.h b/torch/csrc/distributed/autograd/utils.h index c6316378a146..2a0a066e1a95 100644 --- a/torch/csrc/distributed/autograd/utils.h +++ b/torch/csrc/distributed/autograd/utils.h @@ -51,7 +51,8 @@ sendMessageWithAutograd( const rpc::WorkerInfo& dst, rpc::Message&& wrappedRpcMsg, bool forceGradRecording = false, - const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout); + const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout, + bool forceDisableProfiling = false); } // namespace autograd } // namespace distributed diff --git a/torch/csrc/distributed/rpc/rref_impl.cpp b/torch/csrc/distributed/rpc/rref_impl.cpp index 34249172473c..6c6a377a4652 100644 --- a/torch/csrc/distributed/rpc/rref_impl.cpp +++ b/torch/csrc/distributed/rpc/rref_impl.cpp @@ -141,9 +141,6 @@ IValue UserRRef::toHere(const float timeoutSeconds) const { "to_here#({})->({})", RpcAgent::getCurrentRpcAgent()->getWorkerInfo().name_, RpcAgent::getCurrentRpcAgent()->getWorkerInfo(ownerId_).name_); - auto& remoteProfilerManager = - torch::distributed::rpc::RemoteProfilerManager::getInstance(); - remoteProfilerManager.setCurrentKey(toHereKey); } RECORD_USER_SCOPE(toHereKey); TORCH_CHECK( @@ -170,12 +167,16 @@ IValue UserRRef::toHere(const float timeoutSeconds) const { msgToSend = ScriptRRefFetchCall(ownerId_, rrefId()).toMessage(); } + // toHere is profiled as a blocking call, and does not execute operations on + // the remote node. Hence, don't wrap it with a profiling message since we + // don't need the profiler to be enabled remotely. auto futureResponse = autograd::sendMessageWithAutograd( *agent, agent->getWorkerInfo(ownerId_), std::move(msgToSend), true /* forceGradRecording */, - timeoutSeconds); + timeoutSeconds, + true /* forceDisableProfiling */); // TODO: we should ideally be able to interrupt this blocking wait if we check // getTimedOut() and it is true From cb75addee4dfd7604766397f8f0a294b950c4a03 Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Tue, 22 Sep 2020 21:15:15 -0700 Subject: [PATCH 041/449] torch.package - a way to package models and code (#45015) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45015 torch.package allows you to write packages of code, pickled python data, and arbitrary binary and text resources into a self-contained package. torch.package.PackageExporter writes the packages and torch.package.PackageImporter reads them. The importers can load this code in a hermetic way, such that code is loaded from the package rather than the normal python import system. This allows for the packaging of PyTorch model code and data so that it can be run on a server or used in the future for transfer learning. The code contained in packages is copied file-by-file from the original source when it is created, and the file format is a specially organized zip file. Future users of the package can unzip the package, and edit the code in order to perform custom modifications to it. The importer for packages ensures that code in the module can only be loaded from within the package, except for modules explicitly listed as external using :method:`extern_module`. 
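A rough usage sketch (exercising only the APIs covered by the new `test_package.py`; the archive name is arbitrary, and `package_a`/`module_a` are the test fixtures added in this PR, standing in for any locally importable module):

```python
from torch.package import PackageExporter, PackageImporter

data = {"note": "any picklable object works here"}

# Write a package: module source is copied file-by-file into the zip archive,
# and pickled data pulls in whatever code it depends on.
with PackageExporter('example_package.zip', verbose=False) as exporter:
    exporter.extern_modules(['module_a'])          # resolved from the loading environment
    exporter.save_module('package_a')              # source copied into the package
    exporter.save_pickle('res', 'data.pkl', data)

# Read it back hermetically: imports resolve from inside the archive rather than
# sys.path, except for the modules declared extern above.
importer = PackageImporter('example_package.zip')
pkg = importer.import_module('package_a')
data_loaded = importer.load_pickle('res', 'data.pkl')
```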
The file `extern_modules` in the zip archive lists all the modules that a package externally depends on. This prevents "implicit" dependencies where the package runs locally because it is importing a locally-installed package, but then fails when the package is copied to another machine. Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D23824337 Pulled By: zdevito fbshipit-source-id: 1247c34ba9b656f9db68a83e31f2a0fbe3bea6bd --- test/module_a.py | 1 + test/namespace_b/subpackage.py | 0 test/package_a/__init__.py | 7 + test/package_a/subpackage.py | 3 + test/run_test.py | 3 +- test/test_package.py | 309 +++++++++++++++++ torch/package/__init__.py | 2 + torch/package/_custom_import_pickler.py | 78 +++++ torch/package/_importlib.py | 83 +++++ torch/package/_mock.py | 39 +++ torch/package/_mock_zipreader.py | 48 +++ torch/package/exporter.py | 435 ++++++++++++++++++++++++ torch/package/find_file_dependencies.py | 42 +++ torch/package/importer.py | 388 +++++++++++++++++++++ torch/serialization.py | 4 +- 15 files changed, 1439 insertions(+), 3 deletions(-) create mode 100644 test/module_a.py create mode 100644 test/namespace_b/subpackage.py create mode 100644 test/package_a/__init__.py create mode 100644 test/package_a/subpackage.py create mode 100644 test/test_package.py create mode 100644 torch/package/__init__.py create mode 100644 torch/package/_custom_import_pickler.py create mode 100644 torch/package/_importlib.py create mode 100644 torch/package/_mock.py create mode 100644 torch/package/_mock_zipreader.py create mode 100644 torch/package/exporter.py create mode 100644 torch/package/find_file_dependencies.py create mode 100644 torch/package/importer.py diff --git a/test/module_a.py b/test/module_a.py new file mode 100644 index 000000000000..685af9bc1569 --- /dev/null +++ b/test/module_a.py @@ -0,0 +1 @@ +result = 'module_a' diff --git a/test/namespace_b/subpackage.py b/test/namespace_b/subpackage.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/package_a/__init__.py b/test/package_a/__init__.py new file mode 100644 index 000000000000..4761b3db5e41 --- /dev/null +++ b/test/package_a/__init__.py @@ -0,0 +1,7 @@ +result = 'package_a' + +class PackageAObject: + __slots__ = ['obj'] + + def __init__(self, obj): + self.obj = obj diff --git a/test/package_a/subpackage.py b/test/package_a/subpackage.py new file mode 100644 index 000000000000..46f729d51852 --- /dev/null +++ b/test/package_a/subpackage.py @@ -0,0 +1,3 @@ +result = 'package_a.subpackage' +class PackageASubpackageObject: + pass diff --git a/test/run_test.py b/test/run_test.py index 606e20a6f723..d63fc372f9c2 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -89,7 +89,8 @@ 'test_determination', 'test_futures', 'test_fx', - 'test_functional_autograd_benchmark' + 'test_functional_autograd_benchmark', + 'test_package', ] WINDOWS_BLOCKLIST = [ diff --git a/test/test_package.py b/test/test_package.py new file mode 100644 index 000000000000..a25726a53c00 --- /dev/null +++ b/test/test_package.py @@ -0,0 +1,309 @@ +from unittest import main, skipIf +from torch.testing._internal.common_utils import TestCase, IS_WINDOWS +from tempfile import NamedTemporaryFile +from torch.package import PackageExporter, PackageImporter +from pathlib import Path +from tempfile import TemporaryDirectory +import torch +from sys import version_info + +try: + from torchvision.models import resnet18 + HAS_TORCHVISION = True +except ImportError: + HAS_TORCHVISION = False +skipIfNoTorchVision = skipIf(not 
HAS_TORCHVISION, "no torchvision") + + + +packaging_directory = Path(__file__).parent + +class PackagingTest(TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._temporary_files = [] + + def temp(self): + t = NamedTemporaryFile() + name = t.name + if IS_WINDOWS: + t.close() # can't read an open file in windows + else: + self._temporary_files.append(t) + return name + + def tearDown(self): + for t in self._temporary_files: + t.close() + self._temporary_files = [] + + def test_saving_source(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.save_source_file('foo', str(packaging_directory / 'module_a.py')) + he.save_source_file('foodir', str(packaging_directory / 'package_a')) + hi = PackageImporter(filename) + foo = hi.import_module('foo') + s = hi.import_module('foodir.subpackage') + self.assertEqual(foo.result, 'module_a') + self.assertEqual(s.result, 'package_a.subpackage') + + def test_saving_string(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + src = """\ +import math +the_math = math +""" + he.save_source_string('my_mod', src) + hi = PackageImporter(filename) + m = hi.import_module('math') + import math + self.assertIs(m, math) + my_mod = hi.import_module('my_mod') + self.assertIs(my_mod.math, math) + + def test_save_module(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + import module_a + import package_a + he.save_module(module_a.__name__) + he.save_module(package_a.__name__) + hi = PackageImporter(filename) + module_a_i = hi.import_module('module_a') + self.assertEqual(module_a_i.result, 'module_a') + self.assertIsNot(module_a, module_a_i) + package_a_i = hi.import_module('package_a') + self.assertEqual(package_a_i.result, 'package_a') + self.assertIsNot(package_a_i, package_a) + + def test_pickle(self): + import package_a.subpackage + obj = package_a.subpackage.PackageASubpackageObject() + obj2 = package_a.PackageAObject(obj) + + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.save_pickle('obj', 'obj.pkl', obj2) + hi = PackageImporter(filename) + + # check we got dependencies + sp = hi.import_module('package_a.subpackage') + # check we didn't get other stuff + with self.assertRaises(ImportError): + hi.import_module('module_a') + + obj_loaded = hi.load_pickle('obj', 'obj.pkl') + self.assertIsNot(obj2, obj_loaded) + self.assertIsInstance(obj_loaded.obj, sp.PackageASubpackageObject) + self.assertIsNot(package_a.subpackage.PackageASubpackageObject, sp.PackageASubpackageObject) + + def test_resources(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.save_text('main', 'main', "my string") + he.save_binary('main', 'main_binary', "my string".encode('utf-8')) + src = """\ +import resources +t = resources.load_text('main', 'main') +b = resources.load_binary('main', 'main_binary') +""" + he.save_source_string('main', src, is_package=True) + hi = PackageImporter(filename) + m = hi.import_module('main') + self.assertEqual(m.t, "my string") + self.assertEqual(m.b, "my string".encode('utf-8')) + + def test_extern(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.extern_modules(['package_a.subpackage', 'module_a']) + he.save_module('package_a') + hi = PackageImporter(filename) + import package_a.subpackage + import module_a + + module_a_im = hi.import_module('module_a') + hi.import_module('package_a.subpackage') + 
package_a_im = hi.import_module('package_a') + + self.assertIs(module_a, module_a_im) + self.assertIsNot(package_a, package_a_im) + self.assertIs(package_a.subpackage, package_a_im.subpackage) + + @skipIf(version_info.major < 3 or version_info.minor < 7, 'mock uses __getattr__ a 3.7 feature') + def test_mock(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.mock_modules(['package_a.subpackage', 'module_a']) + he.save_module('package_a') + hi = PackageImporter(filename) + import package_a.subpackage + _ = package_a.subpackage + import module_a + _ = module_a + + m = hi.import_module('package_a.subpackage') + r = m.result + with self.assertRaisesRegex(NotImplementedError, 'was mocked out'): + r() + + @skipIf(version_info.major < 3 or version_info.minor < 7, 'mock uses __getattr__ a 3.7 feature') + def test_custom_requires(self): + filename = self.temp() + + class Custom(PackageExporter): + def require_module(self, name, dependencies): + if name == 'module_a': + self.mock_module('module_a') + elif name == 'package_a': + self.save_source_string('package_a', 'import module_a\nresult = 5\n') + else: + raise NotImplementedError('wat') + + with Custom(filename, verbose=False) as he: + he.save_source_string('main', 'import package_a\n') + + hi = PackageImporter(filename) + hi.import_module('module_a').should_be_mocked + bar = hi.import_module('package_a') + self.assertEqual(bar.result, 5) + + @skipIfNoTorchVision + def test_resnet(self): + resnet = resnet18() + + f1 = self.temp() + + # create a package that will save it along with its code + with PackageExporter(f1, verbose=False) as e: + # put the pickled resnet in the package, by default + # this will also save all the code files references by + # the objects in the pickle + e.save_pickle('model', 'model.pkl', resnet) + + # we can now load the saved model + i = PackageImporter(f1) + r2 = i.load_pickle('model', 'model.pkl') + + # test that it works + input = torch.rand(1, 3, 224, 224) + ref = resnet(input) + self.assertTrue(torch.allclose(r2(input), ref)) + + # functions exist also to get at the private modules in each package + torchvision = i.import_module('torchvision') + + f2 = self.temp() + # if we are doing transfer learning we might want to re-save + # things that were loaded from a package + with PackageExporter(f2, verbose=False) as e: + # We need to tell the exporter about any modules that + # came from imported packages so that it can resolve + # class names like torchvision.models.resnet.ResNet + # to their source code. + + e.importers.insert(0, i.import_module) + + # e.importers is a list of module importing functions + # that by default contains importlib.import_module. + # it is searched in order until the first success and + # that module is taken to be what torchvision.models.resnet + # should be in this code package. In the case of name collisions, + # such as trying to save a ResNet from two different packages, + # we take the first thing found in the path, so only ResNet objects from + # one importer will work. This avoids a bunch of name mangling in + # the source code. If you need to actually mix ResNet objects, + # we suggest reconstructing the model objects using code from a single package + # using functions like save_state_dict and load_state_dict to transfer state + # to the correct code objects. 
+ e.save_pickle('model', 'model.pkl', r2) + + i2 = PackageImporter(f2) + r3 = i2.load_pickle('model', 'model.pkl') + self.assertTrue(torch.allclose(r3(input), ref)) + + # test we can load from a directory + import zipfile + zf = zipfile.ZipFile(f1, 'r') + + with TemporaryDirectory() as td: + zf.extractall(path=td) + iz = PackageImporter(str(Path(td) / Path(f1).name)) + r4 = iz.load_pickle('model', 'model.pkl') + self.assertTrue(torch.allclose(r4(input), ref)) + + @skipIfNoTorchVision + def test_model_save(self): + + # This example shows how you might package a model + # so that the creator of the model has flexibility about + # how they want to save it but the 'server' can always + # use the same API to load the package. + + # The convension is for each model to provide a + # 'model' package with a 'load' function that actual + # reads the model out of the archive. + + # How the load function is implemented is up to the + # the packager. + + # get our normal torchvision resnet + resnet = resnet18() + + + f1 = self.temp() + # Option 1: save by pickling the whole model + # + single-line, similar to torch.jit.save + # - more difficult to edit the code after the model is created + with PackageExporter(f1, verbose=False) as e: + e.save_pickle('model', 'pickled', resnet) + # note that this source is the same for all models in this approach + # so it can be made part of an API that just takes the model and + # packages it with this source. + src = """\ +import resources # gives you access to the importer from within the package + +# server knows to call model.load() to get the model, +# maybe in the future it passes options as arguments by convension +def load(): + return resources.load_pickle('model', 'pickled') + """ + e.save_source_string('model', src, is_package=True) + + f2 = self.temp() + # Option 2: save with state dict + # - more code to write to save/load the model + # + but this code can be edited later to adjust adapt the model later + with PackageExporter(f2, verbose=False) as e: + e.save_pickle('model', 'state_dict', resnet.state_dict()) + src = """\ +import resources # gives you access to the importer from within the package +from torchvision.models.resnet import resnet18 +def load(): + # if you want, you can later edit how resnet is constructed here + # to edit the model in the package, while still loading the original + # state dict weights + r = resnet18() + state_dict = resources.load_pickle('model', 'state_dict') + r.load_state_dict(state_dict) + return r + """ + e.save_source_string('model', src, is_package=True) + + + + # regardless of how we chose to package, we can now use the model in a server in the same way + input = torch.rand(1, 3, 224, 224) + results = [] + for m in [f1, f2]: + importer = PackageImporter(m) + the_model = importer.import_module('model').load() + r = the_model(input) + results.append(r) + + self.assertTrue(torch.allclose(*results)) + +if __name__ == '__main__': + main() diff --git a/torch/package/__init__.py b/torch/package/__init__.py new file mode 100644 index 000000000000..be7159a1836d --- /dev/null +++ b/torch/package/__init__.py @@ -0,0 +1,2 @@ +from .importer import PackageImporter +from .exporter import PackageExporter diff --git a/torch/package/_custom_import_pickler.py b/torch/package/_custom_import_pickler.py new file mode 100644 index 000000000000..fd5787b6b3e3 --- /dev/null +++ b/torch/package/_custom_import_pickler.py @@ -0,0 +1,78 @@ +from pickle import _Pickler, _getattribute, whichmodule, _extension_registry, _compat_pickle # type: 
ignore
+from pickle import GLOBAL, STACK_GLOBAL, EXT1, EXT2, EXT4, PicklingError
+from struct import pack
+
+class CustomImportPickler(_Pickler):
+    def __init__(self, import_module, *args, **kwargs):
+        self.import_module = import_module
+        super().__init__(*args, **kwargs)
+
+    def save_global(self, obj, name=None):
+        # unfortunately the pickler code is factored in a way that
+        # forces us to copy/paste this function. The only change is marked
+        # CHANGED below.
+        write = self.write
+        memo = self.memo
+
+        if name is None:
+            name = getattr(obj, '__qualname__', None)
+        if name is None:
+            name = obj.__name__
+
+        module_name = whichmodule(obj, name)
+        try:
+            # CHANGED: self.import_module rather than
+            # __import__
+            module = self.import_module(module_name)
+            obj2, parent = _getattribute(module, name)
+        except (ImportError, KeyError, AttributeError):
+            raise PicklingError(
+                "Can't pickle %r: it's not found as %s.%s" %
+                (obj, module_name, name)) from None
+        else:
+            if obj2 is not obj:
+                raise PicklingError(
+                    "Can't pickle %r: it's not the same object as %s.%s" %
+                    (obj, module_name, name))
+
+        if self.proto >= 2:
+            code = _extension_registry.get((module_name, name))
+            if code:
+                assert code > 0
+                if code <= 0xff:
+                    write(EXT1 + pack("<B", code))
+                elif code <= 0xffff:
+                    write(EXT2 + pack("<H", code))
+                else:
+                    write(EXT4 + pack("<i", code))
+                return
+        lastname = name.rpartition('.')[2]
+        if parent is module:
+            name = lastname
+        # Non-ASCII identifiers are supported only with protocols >= 3.
+        if self.proto >= 4:
+            self.save(module_name)
+            self.save(name)
+            write(STACK_GLOBAL)
+        elif parent is not module:
+            self.save_reduce(getattr, (parent, lastname))
+        elif self.proto >= 3:
+            write(GLOBAL + bytes(module_name, "utf-8") + b'\n' +
+                  bytes(name, "utf-8") + b'\n')
+        else:
+            if self.fix_imports:
+                r_name_mapping = _compat_pickle.REVERSE_NAME_MAPPING
+                r_import_mapping = _compat_pickle.REVERSE_IMPORT_MAPPING
+                if (module_name, name) in r_name_mapping:
+                    module_name, name = r_name_mapping[(module_name, name)]
+                elif module_name in r_import_mapping:
+                    module_name = r_import_mapping[module_name]
+            try:
+                write(GLOBAL + bytes(module_name, "ascii") + b'\n' +
+                      bytes(name, "ascii") + b'\n')
+            except UnicodeEncodeError:
+                raise PicklingError(
+                    "can't pickle global identifier '%s.%s' using "
+                    "pickle protocol %i" % (module, name, self.proto)) from None
+
+        self.memoize(obj)
diff --git a/torch/package/_importlib.py b/torch/package/_importlib.py
new file mode 100644
index 000000000000..1b521ca1a962
--- /dev/null
+++ b/torch/package/_importlib.py
@@ -0,0 +1,83 @@
+import _warnings
+import os.path
+# note: implementations
+# copied from cpython's import code
+
+
+# _zip_searchorder defines how we search for a module in the Zip
+# archive: we first search for a package __init__, then for
+# non-package .pyc, and .py entries. The .pyc entries
+# are swapped by initzipimport() if we run in optimized mode. Also,
+# '/' is replaced by path_sep there.
+
+_zip_searchorder = (
+    ('/__init__.py', True),
+    ('.py', False),
+)
+
+# Replace any occurrences of '\r\n?' in the input string with '\n'.
+# This converts DOS and Mac line endings to Unix line endings.
+def _normalize_line_endings(source): + source = source.replace(b'\r\n', b'\n') + source = source.replace(b'\r', b'\n') + return source + +def _resolve_name(name, package, level): + """Resolve a relative module name to an absolute one.""" + bits = package.rsplit('.', level - 1) + if len(bits) < level: + raise ValueError('attempted relative import beyond top-level package') + base = bits[0] + return '{}.{}'.format(base, name) if name else base + +def _sanity_check(name, package, level): + """Verify arguments are "sane".""" + if not isinstance(name, str): + raise TypeError('module name must be str, not {}'.format(type(name))) + if level < 0: + raise ValueError('level must be >= 0') + if level > 0: + if not isinstance(package, str): + raise TypeError('__package__ not set to a string') + elif not package: + raise ImportError('attempted relative import with no known parent ' + 'package') + if not name and level == 0: + raise ValueError('Empty module name') + +def _calc___package__(globals): + """Calculate what __package__ should be. + + __package__ is not guaranteed to be defined or could be set to None + to represent that its proper value is unknown. + + """ + package = globals.get('__package__') + spec = globals.get('__spec__') + if package is not None: + if spec is not None and package != spec.parent: + _warnings.warn("__package__ != __spec__.parent " + f"({package!r} != {spec.parent!r})", + ImportWarning, stacklevel=3) + return package + elif spec is not None: + return spec.parent + else: + _warnings.warn("can't resolve package from __spec__ or __package__, " + "falling back on __name__ and __path__", + ImportWarning, stacklevel=3) + package = globals['__name__'] + if '__path__' not in globals: + package = package.rpartition('.')[0] + return package + +def _normalize_path(path): + """Normalize a path by ensuring it is a string. + + If the resulting string contains path separators, an exception is raised. 
+ """ + parent, file_name = os.path.split(path) + if parent: + raise ValueError('{!r} must be only a file name'.format(path)) + else: + return file_name diff --git a/torch/package/_mock.py b/torch/package/_mock.py new file mode 100644 index 000000000000..d291bb58ba5e --- /dev/null +++ b/torch/package/_mock.py @@ -0,0 +1,39 @@ + +_magic_methods = ['__subclasscheck__', '__hex__', '__rmul__', + '__float__', '__idiv__', '__setattr__', '__div__', '__invert__', + '__nonzero__', '__rshift__', + '__eq__', '__pos__', '__round__', + '__rand__', '__or__', '__complex__', '__divmod__', + '__len__', '__reversed__', '__copy__', '__reduce__', + '__deepcopy__', '__rdivmod__', '__rrshift__', '__ifloordiv__', + '__hash__', '__iand__', '__xor__', '__isub__', '__oct__', + '__ceil__', '__imod__', '__add__', '__truediv__', + '__unicode__', '__le__', '__delitem__', '__sizeof__', '__sub__', + '__ne__', '__pow__', '__bytes__', '__mul__', + '__itruediv__', '__bool__', '__iter__', '__abs__', + '__gt__', '__iadd__', '__enter__', + '__floordiv__', '__call__', '__neg__', + '__and__', '__ixor__', '__getitem__', '__exit__', '__cmp__', + '__getstate__', '__index__', '__contains__', '__floor__', '__lt__', '__getattr__', + '__mod__', '__trunc__', '__delattr__', '__instancecheck__', '__setitem__', '__ipow__', + '__ilshift__', '__long__', '__irshift__', '__imul__', + '__lshift__', '__dir__', '__ge__', '__int__', '__ior__'] + + +class MockedObject: + _name: str + + def __init__(self, name): + self.__dict__['_name'] = name + + def __repr__(self): + return f"MockedObject({self._name})" + + +def install_method(method_name): + def _not_implemented(self, *args, **kwargs): + raise NotImplementedError(f"Object '{self._name}' was mocked out during packaging but it is being used in {method_name}") + setattr(MockedObject, method_name, _not_implemented) + +for method_name in _magic_methods: + install_method(method_name) diff --git a/torch/package/_mock_zipreader.py b/torch/package/_mock_zipreader.py new file mode 100644 index 000000000000..b273d41fba51 --- /dev/null +++ b/torch/package/_mock_zipreader.py @@ -0,0 +1,48 @@ +import torch +from glob import glob +import os.path +from typing import List, Any + +_storages : List[Any] = [ + torch.DoubleStorage, + torch.FloatStorage, + torch.LongStorage, + torch.IntStorage, + torch.ShortStorage, + torch.CharStorage, + torch.ByteStorage, + torch.BoolStorage, +] +_dtype_to_storage = { + data_type(0).dtype: data_type for data_type in _storages +} + +# because get_storage_from_record returns a tensor!? 
+class _HasStorage(object): + def __init__(self, storage): + self._storage = storage + + def storage(self): + return self._storage + + +class MockZipReader(object): + def __init__(self, directory): + self.directory = directory + + def get_record(self, name): + filename = f'{self.directory}/{name}' + with open(filename, 'rb') as f: + return f.read() + + def get_storage_from_record(self, name, numel, dtype): + storage = _dtype_to_storage[dtype] + filename = f'{self.directory}/{name}' + return _HasStorage(storage.from_file(filename=filename, size=numel)) + + def get_all_records(self, ): + files = [] + for filename in glob(f'{self.directory}/**', recursive=True): + if not os.path.isdir(filename): + files.append(filename[len(self.directory) + 1:]) + return files diff --git a/torch/package/exporter.py b/torch/package/exporter.py new file mode 100644 index 000000000000..8530f6f68f3a --- /dev/null +++ b/torch/package/exporter.py @@ -0,0 +1,435 @@ +import torch +from torch.serialization import normalize_storage_type, location_tag, _should_read_directly +import io +import pickle +import pickletools +from .find_file_dependencies import find_files_source_depends_on +from ._custom_import_pickler import CustomImportPickler +from ._importlib import _normalize_path +import types +import importlib +from typing import List, Any, Callable, Dict +from distutils.sysconfig import get_python_lib +from pathlib import Path +import linecache +import sys + +class PackageExporter: + """ Exporters allow you to write packages of code, pickled python data, and + arbitrary binary and text resources into a self-contained package. + + Imports can load this code in a hermetic way, such that code is loaded + from the package rather than the normal python import system. This allows + for the packaging of PyTorch model code and data so that it can be run + on a server or used in the future for transfer learning. + + The code contained in packages is copied file-by-file from the original + source when it is created, and the file format is a specially organized + zip file. Future users of the package can unzip the package, and edit the code + in order to perform custom modifications to it. + + The importer for packages ensures that code in the module can only be loaded from + within the package, except for modules explicitly listed as external using :method:`extern_module`. + The file `extern_modules` in the zip archive lists all the modules that a package externally depends on. + This prevents "implicit" dependencies where the package runs locally because it is importing + a locally-installed package, but then fails when the package is copied to another machine. + + + Dependencies + ------------ + + When source code is added to the package, the exporter optionally can scan it + for further code dependencies (`dependencies=True`). It looks for import statements, + resolves relative references to qualified module names, and calls :method:`require_module` + on each it finds, recursively resolving dependencies. + + """ + + importers: List[Callable[[str], Any]] + """ A list of functions that will be called in order to find the module assocated + with module names referenced by other modules or by pickled objects. Initialized to + `[importlib.import_module]` by default. When pickling code or objects that was loaded + from an imported packaged, that `importer.import_module` should be put into the importer list. 
+ When a name conflict occurs between importers, the first importer in the list takes precedence, + and only objects that refer to this first importers class can be saved + """ + + + def __init__(self, filename: str, verbose: bool = True): + """ + Create an exporter. + + Args: + filename: e.g. my_package.zip + verbose: Print information about dependency resolution to stdout. + Useful for tracking down why certain files get included. + """ + self.zip_file = torch._C.PyTorchFileWriter(filename) + self.serialized_storages : Dict[str, Any] = {} + self.external : List[str] = [] + self.provided : Dict[str, bool] = {} + self.verbose = verbose + self.importers = [importlib.import_module] + + def save_source_file(self, module_name: str, file_or_directory: str, dependencies=True): + """Adds the local file system `file_or_directory` to the source package to provide the code + for `module_name`. + + Args: + module_name (str): e.g. `my_package.my_subpackage`, code will be saved to provide code for this package. + file_or_directory (str): the path to a file or directory of code. When a directory, all python files in the directory + are recursively copied using :meth:`save_source_file`. If a file is named "/__init__.py" the code is treated + as a package. + dependencies (bool, optional): If True, we scan the source for dependencies (see :ref:`Dependencies`). + """ + path = Path(file_or_directory) + if path.is_dir(): + to_save = [] # list of tuples with arguments to save_source_string + module_path = module_name.replace('.', '/') + for filename in path.glob('**/*.py'): + relative_path = filename.relative_to(path).as_posix() + archivename = module_path + '/' + relative_path + if filename.is_dir(): + self.provided[archivename] = True + else: + submodule_name = None + if filename.name == '__init__.py': + submodule_name = archivename[:-len('/__init__.py')].replace('/', '.') + is_package = True + else: + submodule_name = archivename[:-len('.py')].replace('/', '.') + is_package = False + + self.provided[submodule_name] = True + # we delay the call to save_source_string so that we record all the source files + # being provided by this directory structure _before_ attempting to resolve the dependencies + # on the source. This makes sure we don't try to copy over modules that will just get + # overwritten by this directory blob + to_save.append((submodule_name, _read_file(str(filename)), is_package, dependencies, str(filename))) + + for item in to_save: + self.save_source_string(*item) + else: + is_package = path.name == '__init__.py' + self.save_source_string(module_name, _read_file(file_or_directory), is_package, dependencies, file_or_directory) + + def save_source_string(self, module_name: str, src: str, is_package: bool = False, + dependencies: bool = True, orig_file_name: str = None): + """Adds `src` as the source code for `module_name` in the exported package. + + Args: + module_name (str): e.g. `my_package.my_subpackage`, code will be saved to provide code for this package. + src (str): The python source code to save for this package + is_package (bool, optional): If True, this module is treated as a package. Packages are allowed to have submodules + (e.g. my_package.my_subpackage.my_subsubpackage), and resources can be saved inside them. Defaults to False. + dependencies (bool, optional): If True, we scan the source for dependencies (see :ref:`Dependencies`). + orig_file_name (str, optional): If present, used in logging to identifying where the source came from. Defaults to None. 
+ """ + self.provided[module_name] = True + extension = '/__init__.py' if is_package else '.py' + filename = module_name.replace('.', '/') + extension + self._write(filename, src) + if dependencies: + package = module_name if is_package else module_name.rsplit('.', maxsplit=1)[0] + dep_list = find_files_source_depends_on(src, package) + if self.verbose: + def fmt_dep(mod, obj): + return f'{mod}' if obj is None else f'{mod}.{obj}' + dep_str = ''.join(f' {fmt_dep(mod, obj)}\n' for mod, obj in dep_list) + file_info = f'(from file {orig_file_name}) ' if orig_file_name is not None else '' + print(f"{module_name} {file_info}depends on:\n{dep_str}\n") + + for dep_module_name, dep_module_obj in dep_list: + # handle the case where someone did something like `from pack import sub` + # where `sub` is a submodule. In this case we don't have to save pack, just sub. + # this ensures we don't pick up additional dependencies on pack. + # However, in the case where `sub` is not a submodule but an object, then we do have + # to save pack. + if dep_module_obj is not None: + possible_submodule = f'{dep_module_name}.{dep_module_obj}' + if self._module_exists(possible_submodule): + self.require_module_if_not_provided(possible_submodule) + # we don't need to save `pack` + continue + if self._module_exists(dep_module_name): + self.require_module_if_not_provided(dep_module_name) + + def _module_exists(self, module_name: str) -> bool: + try: + self._import_module(module_name) + return True + except ModuleNotFoundError: + return False + + def _get_source_of_module(self, module: types.ModuleType) -> str: + filename = getattr(module, '__file__', None) + result = None if filename is None else linecache.getlines(filename, module.__dict__) + if result is None: + raise ValueError(f'cannot save source for module "{module.__name__}" because ' + f'its source file "{filename}" could not be found.') + return ''.join(result) + + def require_module_if_not_provided(self, module_name: str, dependencies=True): + if self._module_is_already_provided(module_name): + return + self.require_module(module_name, dependencies) + + def require_module(self, module_name: str, dependencies=True): + """This is called by dependencies resolution when it finds that something in the package + depends on the module and it is not already present. It then decides how to provide that module. + The default resolution rules will mark the module as extern if it is part of the standard library, + and call `save_module` otherwise. Clients can subclass this object + and override this method to provide other behavior, such as automatically mocking out a whole class + of modules""" + + root_name = module_name.split('.', maxsplit=1)[0] + if self._can_implicitly_extern(root_name): + if self.verbose: + print(f'implicitly adding {root_name} to external modules ' + f'since it is part of the standard library and is a dependency.') + self.extern_module(root_name) + return + + self.save_module(module_name, dependencies) + + def save_module(self, module_name: str, dependencies=True): + """Save the code for `module_name` into the package. Code for the module is resolved using the `importers` path to find the + module object, and then using its `__file__` attribute to find the source code. + Args: + module_name (str): e.g. `my_package.my_subpackage`, code will be saved to provide code for this package. + dependencies (bool, optional): If True, we scan the source for dependencies (see :ref:`Dependencies`). 
+ """ + module = self._import_module(module_name) + source = self._get_source_of_module(module) + self.save_source_string(module_name, source, hasattr(module, '__path__'), dependencies, module.__file__) + + + def _import_module(self, module_name): + last_err = None + for import_module in self.importers: + try: + return import_module(module_name) + except ModuleNotFoundError as err: + last_err = err + if last_err is not None: + raise last_err + else: + raise ModuleNotFoundError(module_name) + + def _create_pickler(self, data_buf): + if self.importers == [importlib.import_module]: + # if we are using the normal import library system, then + # we can use the C implementation of pickle which is faster + return pickle.Pickler(data_buf, protocol=3) + else: + return CustomImportPickler(self._import_module, data_buf, protocol=3) + + def save_pickle(self, package: str, resource: str, obj: Any, dependencies: bool = True): + """Save a python object to the archive using pickle. Equivalent to :func:`torch.save` but saving into + the archive rather than a stand-alone file. Stanard pickle does not save the code, only the objects. + If `dependencies` is true, this method will also scan the pickled objects for which modules are required + to reconstruct them and save the relevant code. + + To be able to save an object where `type(obj).__name__` is `my_module.MyObject`, + `my_module.MyObject` must resolve to the class of the object according to the `importer` order. When saving objects that + have previously been packaged, the importer's `import_module` method will need to be present in the `importer` list + for this to work. + + Args: + package (str): The name of module package this resource should go it (e.g. "my_package.my_subpackage") + resource (str): A unique name for the resource, used to indentify it to load. + obj (Any): The object to save, must be picklable. + dependencies (bool, optional): If True, we scan the source for dependencies (see :ref:`Dependencies`). + """ + filename = self._filename(package, resource) + # Write the pickle data for `obj` + data_buf = io.BytesIO() + pickler = self._create_pickler(data_buf) + pickler.persistent_id = self._persistent_id + pickler.dump(obj) + data_value = data_buf.getvalue() + + if dependencies: + all_dependencies = [] + for opcode, arg, pos in pickletools.genops(data_value): + if opcode.name == 'GLOBAL': # a global reference + assert isinstance(arg, str) + module, field = arg.split(' ') + if module not in all_dependencies: + all_dependencies.append(module) + + if self.verbose: + dep_string = ''.join(f' {dep}\n' for dep in all_dependencies) + print(f"{resource} depends on:\n{dep_string}\n") + + for module_name in all_dependencies: + self.require_module_if_not_provided(module_name) + + self._write(filename, data_value) + + def save_text(self, package: str, resource: str, text: str): + """Save text data to the package + + Args: + package (str): The name of module package this resource should go it (e.g. "my_package.my_subpackage") + resource (str): A unique name for the resource, used to indentify it to load. + text (str): The contents to save + """ + return self.save_binary(package, resource, text.encode('utf-8')) + + def save_binary(self, package, resource, binary: bytes): + """Save raw bytes to the package. + + Args: + package (str): The name of module package this resource should go it (e.g. "my_package.my_subpackage") + resource (str): A unique name for the resource, used to indentify it to load. + binary (str): The data to save. 
+ """ + filename = self._filename(package, resource) + self._write(filename, binary) + + def extern_module(self, module_name: str): + """Include `module` in the list of external modules the package can import. + This will prevent dependency discover from saving + it in the package. The importer will load an external module directly from the standard import system. + Code for extern modules must also exist in the process loading the package. + + Args: + module_name (str): e.g. "my_package.my_subpackage" the name of the external module + """ + if module_name not in self.external: + self.external.append(module_name) + + def extern_modules(self, module_names: List[str]): + """Extern a list of modules. Convience wrapper for calling :meth:`extern_module` on many items. + + Args: + module_names (List[str]): List of module names + """ + for m in module_names: + self.extern_module(m) + + def mock_module(self, module_name: str): + """Replace the code for `module_name` in the package with a fake implementation. This module will return a fake + object for any attribute accessed from it. Because we copy file-by-file, the dependency resolution will sometimes + find files that are imported by model files but whose functionality is never used + (e.g. custom serialization code or training helpers). + Use this function to mock this functionality out without having to modify the original code. + + Args: + module_name (str): e.g. "my_package.my_subpackage" the name of the module to be mocked out. + """ + if '_mock' not in self.provided: + self.save_source_file('_mock', str(Path(__file__).parent / '_mock.py'), dependencies=False) + is_package = hasattr(self._import_module(module_name), '__path__') + self.save_source_string(module_name, _MOCK_IMPL, is_package, dependencies=False) + + + def mock_modules(self, module_names): + """Mock a list of modules. Convience wrapper for calling :meth:`mock_module` on many items. + + Args: + module_names (List[str]): List of module names + """ + for module_name in module_names: + self.mock_module(module_name) + + def _module_is_already_provided(self, qualified_name: str) -> bool: + for mod in self.external: + if qualified_name == mod or qualified_name.startswith(mod + '.'): + return True + return qualified_name in self.provided + + def _persistent_id(self, obj): + # FIXME: the docs say that persistent_id should only return a string + # but torch store returns tuples. This works only in the binary protocol + # see + # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects + # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537 + if torch.is_storage(obj): + storage_type = normalize_storage_type(type(obj)) + obj_key = str(obj._cdata) + location = location_tag(obj) + self.serialized_storages[obj_key] = obj + + return ('storage', + storage_type, + obj_key, + location, + obj.size()) + return None + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.close() + + def _write(self, filename, str_or_bytes): + if isinstance(str_or_bytes, str): + str_or_bytes = str_or_bytes.encode('utf-8') + self.zip_file.write_record(filename, str_or_bytes, len(str_or_bytes)) + + def close(self): + """Write the package to the filesystem. Any calls after close are now invalid. + It is preferable to use resource guard syntax instead: + + with PackageExporter("file.zip") as e: + ... 
+ """ + # Write each tensor to a file named tensor/the_tensor_key in the zip archive + for key in sorted(self.serialized_storages.keys()): + name = 'data/{}'.format(key) + storage = self.serialized_storages[key] + if storage.device.type == 'cpu': + # If it's on the CPU we can directly copy it into the zip file + num_bytes = storage.size() * storage.element_size() + self.zip_file.write_record(name, storage.data_ptr(), num_bytes) + else: + # Copy to a buffer, then serialize that + buf = io.BytesIO() + storage._write_file(buf, _should_read_directly(buf)) + buf_value = buf.getvalue() + self._write(name, buf_value) + contents = ('\n'.join(self.external) + '\n') + self._write('extern_modules', contents) + del self.zip_file + + def _filename(self, package, resource): + package_path = package.replace('.', '/') + resource = _normalize_path(resource) + return f'{package_path}/{resource}' + + def _can_implicitly_extern(self, module_name: str): + return module_name == 'torch' or (module_name not in _DISALLOWED_MODULES + and _is_builtin_or_stdlib_module(self._import_module(module_name))) + + +# even though these are in the standard library, we do not allow them to be +# automatically externed since they offer a lot of system level access +_DISALLOWED_MODULES = ['sys', 'io'] + +def _is_builtin_or_stdlib_module(module: types.ModuleType) -> bool: + if module.__name__ in sys.builtin_module_names: + return True + filename = module.__file__ + if filename is None: + return False + standard_lib = get_python_lib(standard_lib=True) + # this is often a subdirectory of standard_lib so we have to check + # that the file is in the standard_lib directory but not in this one + installed_libs = get_python_lib(standard_lib=False) + in_standard_lib = filename.startswith(standard_lib + '/') + in_installed_libs = filename.startswith(installed_libs + '/') + return in_standard_lib and not in_installed_libs + +_MOCK_IMPL = """\ +from _mock import MockedObject +def __getattr__(attr: str): + return MockedObject(__name__ + '.' 
+ attr) +""" + +def _read_file(filename: str) -> str: + with open(filename, 'rb') as f: + b = f.read() + return b.decode('utf-8') diff --git a/torch/package/find_file_dependencies.py b/torch/package/find_file_dependencies.py new file mode 100644 index 000000000000..25b501e37adc --- /dev/null +++ b/torch/package/find_file_dependencies.py @@ -0,0 +1,42 @@ +from typing import List, Optional, Tuple +import ast +from ._importlib import _resolve_name + +class _ExtractModuleReferences(ast.NodeVisitor): + """ + Extract the list of global variables a block of code will read and write + """ + + @classmethod + def run(cls, src: str, package: str) -> List[Tuple[str, Optional[str]]]: + visitor = cls(package) + tree = ast.parse(src) + visitor.visit(tree) + return list(visitor.references.keys()) + + def __init__(self, package): + super().__init__() + self.package = package + self.references = {} + + def _absmodule(self, module_name: str, level: int) -> str: + if level > 0: + return _resolve_name(module_name, self.package, level) + return module_name + + def visit_Import(self, node): + for alias in node.names: + self.references[(alias.name, None)] = True + + def visit_ImportFrom(self, node): + name = self._absmodule(node.module, 0 if node.level is None else node.level) + for alias in node.names: + # from my_package import foo + # foo may be a module, so we have to add it to the list of + # potential references, if import of it fails, we will ignore it + if alias.name != '*': + self.references[(name, alias.name)] = True + else: + self.references[(name, None)] = True + +find_files_source_depends_on = _ExtractModuleReferences.run diff --git a/torch/package/importer.py b/torch/package/importer.py new file mode 100644 index 000000000000..59c7cd9d0312 --- /dev/null +++ b/torch/package/importer.py @@ -0,0 +1,388 @@ +from typing import List, Callable, Dict, Optional, Any, Union +import builtins +import importlib +from torch.serialization import _load +import pickle +import torch +import _compat_pickle # type: ignore +import types +import os.path + +from ._importlib import _normalize_line_endings, _resolve_name, _sanity_check, _calc___package__, \ + _normalize_path +from ._mock_zipreader import MockZipReader + +class PackageImporter: + """Importers allow you to load code written to packages by PackageExporter. + Code is loaded in a hermetic way, using files from the package + rather than the normal python import system. This allows + for the packaging of PyTorch model code and data so that it can be run + on a server or used in the future for transfer learning. + + The importer for packages ensures that code in the module can only be loaded from + within the package, except for modules explicitly listed as external during export. + The file `extern_modules` in the zip archive lists all the modules that a package externally depends on. + This prevents "implicit" dependencies where the package runs locally because it is importing + a locally-installed package, but then fails when the package is copied to another machine. + """ + + modules : Dict[str, Optional[types.ModuleType]] + """The dictionary of already loaded modules from this package, equivalent to `sys.modules` but + local to this importer. + """ + + def __init__(self, filename: str, module_allowed: Callable[[str], bool] = lambda module_name: True): + """Open `filename` for importing. This checks that the imported package only requires modules + allowed by `module_allowed` + + Args: + filename (str): archive to load. 
Can also be a directory of the unzipped files in the archive + for easy debugging and editing. + module_allowed (Callable[[str], bool], optional): A method to determine if a externally provided module + should be allowed. Can be used to ensure packages loaded do not depend on modules that the server + does not support. Defaults to allowing anything. + + Raises: + ImportError: If the package will use a disallowed module. + """ + self.filename = filename + self.zip_reader : Any + if not os.path.isdir(self.filename): + self.zip_reader = torch._C.PyTorchFileReader(self.filename) + else: + self.zip_reader = MockZipReader(self.filename) + + self.root = _PackageNode(None) + self.modules = {} + self.extern_modules = self._read_extern() + + for extern_module in self.extern_modules: + if not module_allowed(extern_module): + raise ImportError(f"package '{filename}' needs the external module '{extern_module}' " + f"but that module has been disallowed") + self._add_extern(extern_module) + + for filename in self.zip_reader.get_all_records(): + self._add_file(filename) + + self.patched_builtins = builtins.__dict__.copy() + self.patched_builtins['__import__'] = self.__import__ + # allow pickles from archive using `import resources` + self.modules['resources'] = self # type: ignore + + # used for torch.serialization._load + self.Unpickler = lambda *args, **kwargs: _UnpicklerWrapper(self, *args, **kwargs) + + def import_module(self, name: str, package=None): + """Load a module from the package if it hasn't already been loaded, and then return + the module. Modules are loaded locally + to the importer and will appear in `self.modules` rather than `sys.modules` + + Args: + name (str): Fully qualified name of the module to load. + package ([type], optional): Unused, but present to match the signature of importlib.import_module. Defaults to None. + + Returns: + types.ModuleType: the (possibly already) loaded module. + """ + return self._gcd_import(name) + + def load_binary(self, package: str, resource: str) -> bytes: + """Load raw bytes. + + Args: + package (str): The name of module package (e.g. "my_package.my_subpackage") + resource (str): The unique name for the resource. + + Returns: + bytes: The loaded data. + """ + + path = self._zipfile_path(package, resource) + return self.zip_reader.get_record(path) + + def load_text(self, package: str, resource: str, encoding: str = 'utf-8', errors: str = 'strict') -> str: + """Load a string. + + Args: + package (str): The name of module package (e.g. "my_package.my_subpackage") + resource (str): The unique name for the resource. + encoding (str, optional): Passed to `decode`. Defaults to 'utf-8'. + errors (str, optional): Passed to `decode`. Defaults to 'strict'. + + Returns: + str: The loaded text. + """ + data = self.load_binary(package, resource) + return data.decode(encoding, errors) + + def load_pickle(self, package: str, resource: str, map_location=None) -> Any: + """Unpickles the resource from the package, loading any modules that are needed to construct the objects + using :meth:`import_module` + + Args: + package (str): The name of module package (e.g. "my_package.my_subpackage") + resource (str): The unique name for the resource. + map_location: Passed to `torch.load` to determine how tensors are mapped to devices. Defaults to None. + + Returns: + Any: the unpickled object. 
+ """ + pickle_file = self._zipfile_path(package, resource) + return _load(self.zip_reader, map_location, self, pickle_file=pickle_file) + + + def _read_extern(self): + return self.zip_reader.get_record('extern_modules').decode('utf-8').splitlines(keepends=False) + + def _make_module(self, name: str, filename: Optional[str], is_package: bool): + spec = importlib.machinery.ModuleSpec(name, self, is_package=is_package) # type: ignore + module = importlib.util.module_from_spec(spec) + self.modules[name] = module + ns = module.__dict__ + ns['__spec__'] = spec + ns['__loader__'] = self + ns['__file__'] = filename + ns['__cached__'] = None + ns['__builtins__'] = self.patched_builtins + if filename is not None: + code = self._compile_source(filename) + exec(code, ns) + return module + + def _load_module(self, name: str): + cur : _PathNode = self.root + for atom in name.split('.'): + if not isinstance(cur, _PackageNode) or atom not in cur.children: + raise ModuleNotFoundError( + f'No module named "{name}" in self-contained archive "{self.filename}"' + f' and the module is also not in the list of allowed external modules: {self.extern_modules}') + cur = cur.children[atom] + if isinstance(cur, _ExternNode): + module = self.modules[name] = importlib.import_module(name) + return module + return self._make_module(name, cur.source_file, isinstance(cur, _PackageNode)) # type: ignore + + def _compile_source(self, fullpath): + source = self.zip_reader.get_record(fullpath) + source = _normalize_line_endings(source) + return compile(source, fullpath, 'exec', dont_inherit=True) + + # note: named `get_source` so that linecache can find the source + # when this is the __loader__ of a module. + def get_source(self, module_name) -> str: + module = self.import_module(module_name) + return self.zip_reader.get_record(module.__file__).decode('utf-8') + + # note: copied from cpython's import code, with call to create module replaced with _make_module + def _do_find_and_load(self, name): + path = None + parent = name.rpartition('.')[0] + if parent: + if parent not in self.modules: + self._gcd_import(parent) + # Crazy side-effects! + if name in self.modules: + return self.modules[name] + parent_module = self.modules[parent] + try: + path = parent_module.__path__ # type: ignore + except AttributeError: + msg = (_ERR_MSG + '; {!r} is not a package').format(name, parent) + raise ModuleNotFoundError(msg, name=name) from None + + module = self._load_module(name) + + if parent: + # Set the module as an attribute on its parent. + parent_module = self.modules[parent] + if parent_module.__loader__ is self: # type: ignore + setattr(parent_module, name.rpartition('.')[2], module) + return module + + # note: copied from cpython's import code + def _find_and_load(self, name): + module = self.modules.get(name, _NEEDS_LOADING) + if module is _NEEDS_LOADING: + return self._do_find_and_load(name) + + if module is None: + message = ('import of {} halted; ' + 'None in sys.modules'.format(name)) + raise ModuleNotFoundError(message, name=name) + + return module + + + def _gcd_import(self, name, package=None, level=0): + """Import and return the module based on its name, the package the call is + being made from, and the level adjustment. + + This function represents the greatest common denominator of functionality + between import_module and __import__. This includes setting __package__ if + the loader did not. 
+ + """ + _sanity_check(name, package, level) + if level > 0: + name = _resolve_name(name, package, level) + + return self._find_and_load(name) + + # note: copied from cpython's import code + def _handle_fromlist(self, module, fromlist, *, recursive=False): + """Figure out what __import__ should return. + + The import_ parameter is a callable which takes the name of module to + import. It is required to decouple the function from assuming importlib's + import implementation is desired. + + """ + # The hell that is fromlist ... + # If a package was imported, try to import stuff from fromlist. + if hasattr(module, '__path__'): + for x in fromlist: + if not isinstance(x, str): + if recursive: + where = module.__name__ + '.__all__' + else: + where = "``from list''" + raise TypeError(f"Item in {where} must be str, " + f"not {type(x).__name__}") + elif x == '*': + if not recursive and hasattr(module, '__all__'): + self._handle_fromlist(module, module.__all__, + recursive=True) + elif not hasattr(module, x): + from_name = '{}.{}'.format(module.__name__, x) + try: + self._gcd_import(from_name) + except ModuleNotFoundError as exc: + # Backwards-compatibility dictates we ignore failed + # imports triggered by fromlist for modules that don't + # exist. + if (exc.name == from_name and + self.modules.get(from_name, _NEEDS_LOADING) is not None): + continue + raise + return module + + def __import__(self, name, globals=None, locals=None, fromlist=(), level=0): + if level == 0: + module = self._gcd_import(name) + else: + globals_ = globals if globals is not None else {} + package = _calc___package__(globals_) + module = self._gcd_import(name, package, level) + if not fromlist: + # Return up to the first dot in 'name'. This is complicated by the fact + # that 'name' may be relative. + if level == 0: + return self._gcd_import(name.partition('.')[0]) + elif not name: + return module + else: + # Figure out where to slice the module's name up to the first dot + # in 'name'. + cut_off = len(name) - len(name.partition('.')[0]) + # Slice end needs to be positive to alleviate need to special-case + # when ``'.' not in name``. + return self.modules[module.__name__[:len(module.__name__) - cut_off]] + else: + return self._handle_fromlist(module, fromlist) + + def _get_package(self, package): + """Take a package name or module object and return the module. + + If a name, the module is imported. If the passed or imported module + object is not a package, raise an exception. + """ + if hasattr(package, '__spec__'): + if package.__spec__.submodule_search_locations is None: + raise TypeError('{!r} is not a package'.format( + package.__spec__.name)) + else: + return package + else: + module = self.import_module(package) + if module.__spec__.submodule_search_locations is None: + raise TypeError('{!r} is not a package'.format(package)) + else: + return module + + def _zipfile_path(self, package, resource): + package = self._get_package(package) + resource = _normalize_path(resource) + assert package.__loader__ is self + return f"{package.__name__.replace('.', '/')}/{resource}" + + def _get_or_create_package(self, atoms: List[str]) -> 'Union[_PackageNode, _ExternNode]': + cur = self.root + for i, atom in enumerate(atoms): + node = cur.children.get(atom, None) + if node is None: + node = cur.children[atom] = _PackageNode(None) + if isinstance(node, _ExternNode): + return node + if isinstance(node, _ModuleNode): + name = ".".join(atoms[:i]) + raise ImportError(f'inconsistent module structure. 
module {name} is not a package, but has submodules') + assert isinstance(node, _PackageNode) + cur = node + return cur + + def _add_file(self, filename: str): + *prefix, last = filename.split('/') + package = self._get_or_create_package(prefix) + if isinstance(package, _ExternNode): + raise ImportError(f'inconsistent module structure. package contains a module file {filename}' + f' that is a subpackage of a module marked external.') + if last == '__init__.py': + package.source_file = filename + elif last.endswith('.py'): + package_name = last.rstrip('.py') + package.children[package_name] = _ModuleNode(filename) + + def _add_extern(self, extern_name: str): + *prefix, last = extern_name.split('.') + package = self._get_or_create_package(prefix) + if isinstance(package, _ExternNode): + return # the shorter extern covers this extern case + package.children[last] = _ExternNode() + + +_NEEDS_LOADING = object() +_ERR_MSG_PREFIX = 'No module named ' +_ERR_MSG = _ERR_MSG_PREFIX + '{!r}' + +class _UnpicklerWrapper(pickle._Unpickler): # type: ignore + def __init__(self, importer, *args, **kwargs): + super().__init__(*args, **kwargs) + self._importer = importer + + def find_class(self, module, name): + # Subclasses may override this. + if self.proto < 3 and self.fix_imports: + if (module, name) in _compat_pickle.NAME_MAPPING: + module, name = _compat_pickle.NAME_MAPPING[(module, name)] + elif module in _compat_pickle.IMPORT_MAPPING: + module = _compat_pickle.IMPORT_MAPPING[module] + mod = self._importer.import_module(module) + return getattr(mod, name) + +class _PathNode: + pass + +class _PackageNode(_PathNode): + def __init__(self, source_file: Optional[str]): + self.source_file = source_file + self.children : Dict[str, _PathNode] = {} + +class _ModuleNode(_PathNode): + __slots__ = ['source_file'] + + def __init__(self, source_file: str): + self.source_file = source_file + +class _ExternNode(_PathNode): + pass diff --git a/torch/serialization.py b/torch/serialization.py index c68c1ff0b60d..1c05767922a8 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -821,7 +821,7 @@ def restore_location(storage, location): return restore_location -def _load(zip_file, map_location, pickle_module, **pickle_load_args): +def _load(zip_file, map_location, pickle_module, pickle_file='data.pkl', **pickle_load_args): restore_location = _get_restore_location(map_location) loaded_storages = {} @@ -847,7 +847,7 @@ def persistent_load(saved_id): return storage # Load the data (which may in turn use `persistent_load` to load tensors) - data_file = io.BytesIO(zip_file.get_record('data.pkl')) + data_file = io.BytesIO(zip_file.get_record(pickle_file)) unpickler = pickle_module.Unpickler(data_file, **pickle_load_args) unpickler.persistent_load = persistent_load result = unpickler.load() From 25ed739ac90cb5fa82963131411cc783de0bd8fd Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Tue, 22 Sep 2020 21:15:15 -0700 Subject: [PATCH 042/449] [packaging] rstrip fix (#45166) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45166 Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D23852505 Pulled By: zdevito fbshipit-source-id: 6bb743b37333ae19fc24629686e8d06aef812c50 --- torch/package/importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/package/importer.py b/torch/package/importer.py index 59c7cd9d0312..1a02e69436fa 100644 --- a/torch/package/importer.py +++ b/torch/package/importer.py @@ -340,7 +340,7 @@ def _add_file(self, filename: str): if 
last == '__init__.py': package.source_file = filename elif last.endswith('.py'): - package_name = last.rstrip('.py') + package_name = last[:-len('.py')] package.children[package_name] = _ModuleNode(filename) def _add_extern(self, extern_name: str): From 0a9ac98bed5d3b14566f19e584071764d570cb8c Mon Sep 17 00:00:00 2001 From: Jiakai Liu Date: Tue, 22 Sep 2020 21:40:37 -0700 Subject: [PATCH 043/449] [reland][pytorch] refine dispatch keys in native_functions.yaml (1/N) (#45137) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45137 Reland https://github.com/pytorch/pytorch/pull/45010 - which broke master due to merge conflict. Test Plan: Imported from OSS Reviewed By: bhosmer Differential Revision: D23843510 Pulled By: ljk53 fbshipit-source-id: 28aabb9da533b6b806ab8779a0ee96b695e9e242 --- aten/src/ATen/native/native_functions.yaml | 140 ++++++++++++++++++++- 1 file changed, 135 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index d5a746e2a522..c61f021f8c5f 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -167,13 +167,13 @@ - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) variants: function dispatch: - CUDA: fused_dropout_cuda + CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor use_c10_dispatcher: full variants: function dispatch: - CUDA: masked_scale_cuda + CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -299,6 +299,8 @@ variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: conj_out - func: _conj(Tensor self) -> Tensor use_c10_dispatcher: full @@ -313,6 +315,8 @@ variants: function, method - func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acos_out # arccos, alias of acos - func: arccos(Tensor self) -> Tensor @@ -489,6 +493,8 @@ variants: function, method - func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acosh_out # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor @@ -510,6 +516,8 @@ variants: function, method - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: asinh_out # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor @@ -531,6 +539,8 @@ variants: function, method - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atanh_out # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor @@ -591,6 +601,8 @@ variants: function, method - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan_out # arctan, alias of atan - func: arctan(Tensor self) -> Tensor @@ -682,6 +694,8 @@ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity @@ -926,12 +940,16 @@ variants: function - func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: complex_out - func: polar(Tensor abs, Tensor angle) -> Tensor use_c10_dispatcher: full variants: function - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polar_out - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor use_c10_dispatcher: full @@ -1005,6 +1023,8 @@ variants: function, method - func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1015,6 +1035,8 @@ variants: function, method - func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full @@ -1198,7 +1220,7 @@ - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) use_c10_dispatcher: full dispatch: - CPU: ctc_loss_cpu + CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor @@ -1464,6 +1486,8 @@ variants: function, method - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1474,6 +1498,8 @@ variants: function, method - func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1484,6 +1510,8 @@ variants: function, method - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp_out - func: exp2(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1494,6 +1522,8 @@ variants: function, method - func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1608,6 +1638,8 @@ variants: function, method - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: frac_out - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -1626,6 +1658,8 @@ CPU: from_file - func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: gcd_out - func: gcd(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1636,6 +1670,8 @@ variants: function, method - func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: lcm_out - func: lcm(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2004,12 +2040,16 @@ CPU, CUDA: log2_out - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2724,6 +2764,8 @@ variants: function, method - func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2910,6 +2952,8 @@ - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: silu_out - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -2931,6 +2975,8 @@ MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sigmoid_out - func: logit(Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full @@ -2945,6 +2991,8 @@ CPU, CUDA: logit_ - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2967,6 +3015,8 @@ variants: function, method - func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sinh_out # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. @@ -3177,6 +3227,8 @@ variants: function, method - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3246,6 +3298,8 @@ variants: function, method - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3259,6 +3313,8 @@ variants: function, method - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor use_c10_dispatcher: full @@ -3606,8 +3662,8 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full dispatch: - SparseCPU: _sparse_sum_backward_cpu - SparseCUDA: _sparse_sum_backward_cuda + SparseCPU: _sparse_sum_backward_cpu + SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full @@ -4809,6 +4865,8 @@ - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) use_c10_dispatcher: full @@ -4827,6 +4885,8 @@ - func: digamma_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) use_c10_dispatcher: full @@ -4916,27 +4976,41 @@ - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? 
generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: geometric_ # wrappers for TH functions @@ -5390,6 +5464,8 @@ use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5400,6 +5476,8 @@ variants: method - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5619,12 +5697,16 @@ CPU, CUDA: lgamma - func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor use_c10_dispatcher: full @@ -5657,6 +5739,8 @@ variants: function, method - func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: i0_out - func: sign(Tensor self) -> Tensor use_c10_dispatcher: full @@ -5684,6 +5768,8 @@ variants: method, function - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -5750,19 +5836,27 @@ CUDA: fmod_cuda - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: hypot_out - func: hypot(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: hypot - func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method - func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nextafter_out - func: nextafter(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: nextafter - func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method @@ -6487,10 +6581,14 @@ - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu - func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6500,6 +6598,8 @@ - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu_backward - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6533,6 +6633,8 @@ - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor use_c10_dispatcher: full @@ -6544,6 +6646,8 @@ - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6572,6 +6676,8 @@ - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6582,14 +6688,20 @@ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6613,6 +6725,8 @@ - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) use_c10_dispatcher: full @@ -6678,10 +6792,14 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus - func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6691,13 +6809,19 @@ - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus_backward - func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink - func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6707,6 +6831,8 @@ - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink_backward - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -7478,6 +7604,8 @@ - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: logit_backward - func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7487,6 +7615,8 @@ - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: tanh_backward # What's a thnn_conv_ versus a slow_conv_? 
# From 989d877c95a9107fabcee1bda9a6cfacb8098d94 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 22 Sep 2020 21:41:13 -0700 Subject: [PATCH 044/449] [JIT] Do not allow creating generics with None types (#44958) Summary: Otherwise, invoking something like `python -c "import torch._C;print(torch._C.ListType(None))"` will result in SIGSEGV Discovered while trying to create a torch script for function with the following type annotation `Tuple[int, Ellipsis] -> None` Pull Request resolved: https://github.com/pytorch/pytorch/pull/44958 Reviewed By: suo Differential Revision: D23799906 Pulled By: malfet fbshipit-source-id: 916a243007d13ed3e7a5b282dd712da3d66e3bf7 --- aten/src/ATen/core/jit_type.h | 7 ++++++- aten/src/ATen/core/type.cpp | 3 +++ test/jit/test_list_dict.py | 5 +++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 74eaa7012ac1..1c9d31dd630c 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -263,7 +263,12 @@ struct SingleElementType : public Type { } protected: - SingleElementType(TypePtr elem) : Type(Kind), elem(std::move(elem)) {} + SingleElementType(TypePtr elem) : Type(Kind), elem(std::move(elem)) { + if (!this->elem) { + throw std::runtime_error(c10::str( + "Can not create ", typeKindToString(Kind), " with None type")); + } + } private: TypePtr elem; diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 475c59759f78..13e82d434647 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -716,6 +716,9 @@ TupleType::TupleType( schema_(std::move(schema)) { has_free_variables_ = std::any_of(elements_.begin(), elements_.end(), [](TypePtr v) { + if (!v) { + throw std::runtime_error("Can not create tuple with None type"); + } return v->hasFreeVariables(); }); if (schema_) { diff --git a/test/jit/test_list_dict.py b/test/jit/test_list_dict.py index a1c378963918..8d0f74349b3b 100644 --- a/test/jit/test_list_dict.py +++ b/test/jit/test_list_dict.py @@ -1155,6 +1155,11 @@ def annotated_fn(x: torch.Tensor) -> List: with self.assertRaisesRegex(RuntimeError, r"Attempted to use List without a contained type"): torch.jit.script(annotated_fn) + def test_list_none(self): + with self.assertRaisesRegex(RuntimeError, "Can not create ListType with None type"): + x = torch._C.ListType(None) + + class TestDict(JitTestCase): def dict(self): From 144dacd8d9aee815692524052ea72a5ceb561fe3 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 22 Sep 2020 22:41:51 -0700 Subject: [PATCH 045/449] CUDA BFloat16 batched gemm (#45167) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45167 Reviewed By: mruberry Differential Revision: D23860458 Pulled By: ngimel fbshipit-source-id: 698de424a046963a30017b58d227fa510f85bf3f --- aten/src/THC/THCBlas.cu | 65 ++++++++++++----------- aten/src/THC/THCBlas.h | 8 +-- aten/src/THC/generic/THCTensorMathBlas.cu | 6 +-- test/test_torch.py | 15 +++--- 4 files changed, 46 insertions(+), 48 deletions(-) diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index fe906ce66fa3..73d411f05ef1 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -107,30 +107,9 @@ void adjustLdLevel3(char transa, char transb, int64_t m, int64_t n, int64_t k, i } -// Check https://github.com/pytorch/pytorch/issues/22078 -// for information about the bug. We don't know the exact conditions that trigger it, -// but using Sgemm or Hgemm on Maxwell or Pascal seems to be a -// necessary condition. 
-static void checkCuda90Bug(int i_m, int i_n, int i_k) -{ -#if CUDA_VERSION < 9200 && CUDA_VERSION >= 9000 - static std::once_flag alreadyWarned; - const int LIMIT = 1 << 21; - if (i_m > LIMIT || i_n > LIMIT || i_k > LIMIT) { - cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); - if (prop->major == 5 || prop->major == 6) { - std::call_once(alreadyWarned, []() { - TORCH_WARN("Matrix multiplication for dimensions larger than 2^21 has known bugs on your combination of CUDA version and device type. Please consider upgrading to CUDA 9.2 or later."); - }); - } - } -#endif -} - /* Level 3 */ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) { - checkCuda90Bug((int)m, (int)n, (int)k); at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } @@ -141,11 +120,10 @@ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int6 void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, at::Half *a, int64_t lda, at::Half *b, int64_t ldb, at::Half beta, at::Half *c, int64_t ldc) { - checkCuda90Bug((int)m, (int)n, (int)k); at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -#ifdef __HIP_PLATFORM_HCC__ +#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 void THCudaBlas_Bgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, at::BFloat16 *a, int64_t lda, at::BFloat16 *b, int64_t ldb, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc) { at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -157,7 +135,6 @@ void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int6 at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -#if CUDA_VERSION >= 9010 || defined __HIP_PLATFORM_HCC__ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, const at::Half *a, int64_t lda, int64_t strideA, const at::Half *b, int64_t ldb, int64_t strideB, at::Half beta, at::Half *c, int64_t ldc, int64_t strideC, int64_t batchCount) @@ -205,7 +182,6 @@ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, i #endif // CUDA_VERSION < 11000 #endif // __HIP_PLATFORM_HCC__ } -#endif // CUDA_VERSION or __HIP_PLATFORM_HCC__ #ifdef __HIP_PLATFORM_HCC__ void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, @@ -236,6 +212,40 @@ void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, i } #endif // __HIP_PLATFORM_HCC__ +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + at::BFloat16 alpha, const at::BFloat16 *a, int64_t lda, int64_t strideA, const at::BFloat16 *b, int64_t ldb, int64_t strideB, + at::BFloat16 beta, at::BFloat16 *c, int64_t ldc, int64_t strideC, int64_t batchCount) +{ + at::globalContext().alertCuBLASConfigNotDeterministic(); + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + + { + THError("Cublas_SgemmStridedBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); 
+ } + + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + if (prop->major < 8) { + TORCH_CHECK(false, "BFloat16 gemm in CUDA requires Ampere or later GPU"); + } + + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + float fAlpha = alpha; + float fBeta = beta; + THCublasCheck(cublasGemmStridedBatchedEx(handle, + opa, opb, (int)m, (int)n, (int)k, + (void*)&fAlpha, a, CUDA_R_16BF, (int)lda, strideA, + b, CUDA_R_16BF, (int)ldb, strideB, + (void*)&fBeta, c, CUDA_R_16BF, (int)ldc, strideC, + (int)batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} +#endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, float beta, float *c[], int64_t ldc, int64_t batchCount) @@ -270,7 +280,6 @@ void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t #endif } -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a, int64_t lda, int64_t strideA, const float *b, int64_t ldb, int64_t strideB, float beta, float *c, int64_t ldc, int64_t strideC, int64_t batchCount) @@ -294,7 +303,6 @@ void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, i &alpha, a, (int)lda, strideA, b, (int)ldb, strideB, &beta, c, (int)ldc, strideC, (int)batchCount)); } -#endif void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a[], int64_t lda, const double *b[], int64_t ldb, @@ -330,7 +338,6 @@ void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t #endif } -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a, int64_t lda, int64_t strideA, const double *b, int64_t ldb, int64_t strideB, double beta, double *c, int64_t ldc, int64_t strideC, int64_t batchCount) @@ -353,5 +360,3 @@ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, i &alpha, a, (int)lda, strideA, b, (int)ldb, strideB, &beta, c, (int)ldc, strideC, (int)batchCount)); } -#endif - diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h index cff3180a974a..a9b646a4374f 100644 --- a/aten/src/THC/THCBlas.h +++ b/aten/src/THC/THCBlas.h @@ -14,7 +14,7 @@ THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, THHalf *a, int64_t lda, THHalf *b, int64_t ldb, THHalf beta, THHalf *c, int64_t ldc); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 THC_API void THCudaBlas_Bgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, at::BFloat16 *a, int64_t lda, at::BFloat16 *b, 
int64_t ldb, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc); #endif @@ -24,22 +24,18 @@ THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, THC_API void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a[], int64_t lda, const double *b[], int64_t ldb, double beta, double *c[], int64_t ldc, int64_t batchCount); -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ THC_API void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a, int64_t lda, int64_t strideA, const float *b, int64_t ldb, int64_t strideB, float beta, float *c, int64_t ldc, int64_t strideC, int64_t batchCount); THC_API void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a, int64_t lda, int64_t strideA, const double *b, int64_t ldb, int64_t strideB, double beta, double *c, int64_t ldc, int64_t strideC, int64_t batchCount); -#endif -#if CUDA_VERSION >= 9010 || defined(__HIP_PLATFORM_HCC__) void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, const THHalf *a, int64_t lda, int64_t strideA, const THHalf *b, int64_t ldb, int64_t strideB, THHalf beta, THHalf *c, int64_t ldc, int64_t strideC, int64_t batchCount); -#endif -#ifdef __HIP_PLATFORM_HCC__ +#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, const at::BFloat16 *a, int64_t lda, int64_t strideA, const at::BFloat16 *b, int64_t ldb, int64_t strideB, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc, int64_t strideC, int64_t batchCount); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index 3158e0e267ed..a5d159a9cace 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -281,7 +281,7 @@ void THCTensor_(baddbmm)(THCState *state, THCTensor *result, THCTensor *t, #endif //CUDA_VERSION #elif defined(THC_REAL_IS_BFLOAT16) -#if defined(__HIP_PLATFORM_HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 THCudaBlas_BgemmStridedBatched( state, transpose_batch1, @@ -310,15 +310,13 @@ void THCTensor_(baddbmm)(THCState *state, THCTensor *result, THCTensor *t, THCTensor_(freeCopyTo)(state, result_, result); } -#if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) +#if defined(THC_REAL_IS_BFLOAT16) && !(defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000) // To avoid "variable was set but never used" warning [&transpose_batch1, &transpose_batch2, &lda, &ldb, &ldc]{}(); TORCH_CHECK(false, "BgemmStridedBatched is not supported with at::BFloat16 type"); #endif } -#if !defined(THC_REAL_IS_BFLOAT16) || defined(__HIP_PLATFORM_HCC__) at::namedinference::propagate_names_if_nonempty(result, maybe_outnames); -#endif #else ERROR_ONLY_FP_TYPES("baddbmm"); diff --git a/test/test_torch.py b/test/test_torch.py index 440bf30286bb..6b529712ab5c 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -19626,8 +19626,6 @@ def test_movedim_view(self, device): # with _float_types when bfloat16 bringup is complete on all platforms _float_types2 = _float_types + [torch.bfloat16] if TEST_WITH_ROCM else _float_types 
-_complex_and_float_types2 = _float_types2 + _complex_types - _signed_types = [ torch.half, torch.float, torch.double, torch.int8, torch.short, torch.int, torch.long @@ -19798,20 +19796,21 @@ def inner(self, device, dtype): ('pow', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d).abs()], 1e-1, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes()), ('addbmm', '', _small_2d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], - 1e-1, 1e-1, 1e-4, _complex_and_float_types2, _cpu_types, True, [tf32_on_and_off(0.005)]), + 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, + _cpu_types, True, [tf32_on_and_off(0.005)]), ('addbmm', 'scalar', _small_2d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-1, 1e-1, 1e-4, _complex_and_float_types2, _cpu_types, True, + 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, _cpu_types, True, [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), ('addbmm', 'two_scalars', _small_2d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-1, 1e-1, 1e-4, _complex_and_float_types2, _cpu_types, True, + 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, _cpu_types, True, [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), ('baddbmm', '', _small_3d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2), + 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)), ('baddbmm', 'scalar', _small_3d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, + 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, [_wrap_maybe_warns("This overload of baddbmm_? is deprecated")]), ('baddbmm', 'two_scalars', _small_3d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, + 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, [_wrap_maybe_warns("This overload of baddbmm_? 
is deprecated")]), ('bmm', '', _small_3d, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _float_types_no_half, _cpu_types, False), From 7fba30c2be4c1373c1e4424111e5ec2b878a85da Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Tue, 22 Sep 2020 22:44:41 -0700 Subject: [PATCH 046/449] [quant][fx][bug] Fix error in convert step for QAT (#45050) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45050 Update tests to actually test for QAT Test Plan: python test/test_quantization.py TestQuantizeFxOps.test_linear Imported from OSS Reviewed By: jerryzh168 Differential Revision: D23808022 fbshipit-source-id: d749ab2d215fe19238ff9d539307ffce9ef0ca9b --- torch/quantization/fx/quantize.py | 7 ++++++- torch/testing/_internal/common_quantization.py | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 8d742255838a..6254120999f0 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -132,6 +132,10 @@ def is_activation_post_process(module): return (isinstance(module, torch.quantization.ObserverBase) or isinstance(module, torch.quantization.FakeQuantize)) +def is_submodule_of_fake_quant(name, module, named_modules): + parent_name, _ = _parent_name(name) + return is_activation_post_process(named_modules[parent_name]) + # A dictionary for querying the weight index for a given op WEIGHT_INDEX_DICT = { torch.nn.functional.conv2d : [1], @@ -529,9 +533,10 @@ def load_arg(a): env[node.name] = act_post_process_removed_graph.node_copy(node, load_arg) act_post_process_removed_graph.output(map_arg(self.quantized_graph.result, load_arg)) + module_dict = dict(model.named_modules()) to_be_removed = [] for name, module in model.named_modules(): - if is_activation_post_process(module): + if is_activation_post_process(module) and not is_submodule_of_fake_quant(name, module, module_dict): to_be_removed.append(name) for n in to_be_removed: delattr(model, n) diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index 8339335bd04b..3edbd5dd7fcd 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -11,7 +11,8 @@ from torch.testing._internal.common_utils import TestCase from torch.quantization import QuantWrapper, QuantStub, DeQuantStub, \ default_qconfig, default_dynamic_qconfig, default_per_channel_qconfig, QConfig, default_observer, default_weight_observer, \ - propagate_qconfig_, convert, get_default_qconfig, quantize_dynamic_jit, quantize_jit, float_qparams_dynamic_qconfig + propagate_qconfig_, convert, get_default_qconfig, quantize_dynamic_jit, quantize_jit, float_qparams_dynamic_qconfig, \ + get_default_qat_qconfig from torch.quantization.quantization_mappings import ( get_dynamic_quant_module_mappings, get_qconfig_propagation_list, @@ -614,12 +615,13 @@ def checkGraphModeFxOp(self, model, inputs, quant_type, if type(inputs) == list: inputs = inputs[0] if quant_type == QuantType.QAT: + qconfig_dict = {'': get_default_qat_qconfig(torch.backends.quantized.engine)} model.train() else: + qconfig_dict = {'': get_default_qconfig(torch.backends.quantized.engine)} model.eval() original = symbolic_trace(model) - qconfig_dict = {'': get_default_qconfig(torch.backends.quantized.engine)} if quant_type == QuantType.DYNAMIC: prepare = prepare_dynamic_fx convert = convert_dynamic_fx From 215679573ebff5a03238a7f9aa801a6c00826f19 Mon Sep 17 00:00:00 2001 From: Alex Suhan 
Date: Tue, 22 Sep 2020 23:46:32 -0700 Subject: [PATCH 047/449] [TensorExpr] Fix operator order in combineMultilane (#45157) Summary: combineMultilane used the wrong order when ramp was on the left hand side, which matters for subtract. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45157 Test Plan: test_tensorexpr --gtest_filter=TensorExprTest.SimplifyRampSubBroadcast Reviewed By: ailzhang Differential Revision: D23851751 Pulled By: asuhan fbshipit-source-id: 864d1611e88769fb43327ef226bb3310017bf858 --- test/cpp/tensorexpr/test_simplify.cpp | 14 ++++++++++++++ test/cpp/tensorexpr/tests.h | 1 + torch/csrc/jit/tensorexpr/ir_simplifier.cpp | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index b88aa17efd3e..f8c5cdd3546d 100644 --- a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -3950,5 +3950,19 @@ void testSimplifySyncThreads() { } } +void testSimplifyRampSubBroadcast() { + KernelScope kernel_scope; + int num_lanes = 4; + ExprHandle ramp = Ramp::make(ExprHandle(0), ExprHandle(6), num_lanes); + ExprHandle broadcast = Broadcast::make(ExprHandle(-5), num_lanes); + ExprHandle simplified = IRSimplifier::simplify(ramp - broadcast); + Ramp* newRamp = simplified.AsNode(); + IS_NODE_WITH_NAME(IntImm, newRamp->base(), base); + ASSERT_EQ(base->value(), 5); + IS_NODE_WITH_NAME(IntImm, newRamp->stride(), stride); + ASSERT_EQ(stride->value(), 6); + ASSERT_EQ(newRamp->lanes(), num_lanes); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index c38a368af13c..56831c8db663 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -216,6 +216,7 @@ namespace jit { _(SimplifyReorderForCond) \ _(SimplifyFuseConditions) \ _(SimplifySyncThreads) \ + _(SimplifyRampSubBroadcast) \ _(RegisterizerSimple) \ _(RegisterizerLoop) \ _(RegisterizerLoopFixedLoad) \ diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp index 3429239a4491..f6852b627969 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp @@ -123,7 +123,7 @@ const Expr* combineMultilane(const Expr* lhs, const Expr* rhs) { throw malformed_input("multilane lane mismatch"); } const Expr* ret = new Ramp( - new Op(bc->value(), ramp->base()), ramp->stride(), ramp->lanes()); + new Op(ramp->base(), bc->value()), ramp->stride(), ramp->lanes()); return ret; } } From 76dc50e9c8698da338334ecdc80bb00e60186849 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Wed, 23 Sep 2020 00:44:20 -0700 Subject: [PATCH 048/449] [RPC] Infer backend type if only options are given (#45065) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45065 To preserve backwards compatibility with applications that were passing in some ProcessGroupRpcBackendOptions but were not explicitly setting backend=BackendType.PROCESS_GROUP, we're here now inferring the backend type from the options if only the latter ones are passed. If neither are passed, we'll default to TensorPipe, as before this change. ghstack-source-id: 112586258 Test Plan: Added new unit tests. 
Reviewed By: pritamdamania87 Differential Revision: D23814289 fbshipit-source-id: f4be7919e0817a4f539a50ab12216dc3178cb752 --- torch/csrc/distributed/rpc/init.cpp | 5 + torch/distributed/rpc/__init__.py | 59 +++++++- torch/distributed/rpc/backend_registry.py | 13 ++ .../_internal/distributed/rpc/rpc_test.py | 129 ++++++++++++++++++ 4 files changed, 203 insertions(+), 3 deletions(-) diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index f85adb88dc09..ea1db04225c7 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -50,6 +50,11 @@ PyObject* rpc_init(PyObject* /* unused */) { :meth:`~torch.distributed.rpc.init_rpc` in order to initialize RPC with specific configurations, such as the RPC timeout and ``init_method`` to be used. )") + .def(py::init<>()) + .def( + py::init(), + py::arg("rpc_timeout") = kDefaultRpcTimeoutSeconds, + py::arg("init_method") = kDefaultInitMethod) .def_readwrite( "rpc_timeout", &RpcBackendOptions::rpcTimeoutSeconds, diff --git a/torch/distributed/rpc/__init__.py b/torch/distributed/rpc/__init__.py index 4f1180bf954f..4598c78e72fe 100644 --- a/torch/distributed/rpc/__init__.py +++ b/torch/distributed/rpc/__init__.py @@ -1,7 +1,12 @@ +import logging +import threading import torch import torch.distributed as dist -import threading + + +logger = logging.getLogger(__name__) + _init_counter = 0 _init_counter_lock = threading.Lock() @@ -36,7 +41,7 @@ def is_available(): def init_rpc( name, - backend=BackendType.TENSORPIPE, + backend=None, rank=-1, world_size=None, rpc_backend_options=None, @@ -71,7 +76,55 @@ def init_rpc( are available. """ - if not rpc_backend_options: + if backend is not None and not isinstance(backend, backend_registry.BackendType): + raise TypeError( + "Argument backend must be a member of BackendType" + ) + + if rpc_backend_options is not None and not isinstance(rpc_backend_options, RpcBackendOptions): + raise TypeError( + "Argument rpc_backend_options must be an instance of RpcBackendOptions" + ) + + # To avoid breaking users that passed a ProcessGroupRpcBackendOptions + # without specifying the backend as PROCESS_GROUP when that was the + # default, we try to detect the backend from the options when only the + # latter is passed. + if backend is None and rpc_backend_options is not None: + for candidate_backend in BackendType: + if isinstance( + rpc_backend_options, + type( + backend_registry.construct_rpc_backend_options( + candidate_backend + ) + ), + ): + backend = candidate_backend + break + else: + raise TypeError( + f"Could not infer backend for options {rpc_backend_options}" + ) + if backend != BackendType.TENSORPIPE: + logger.warning( + f"RPC was initialized with no explicit backend but with options " + f"corresponding to {backend}, hence that backend will be used " + f"instead of the default {BackendType.TENSORPIPE}. To silence this " + f"warning pass `backend={backend}` explicitly." + ) + + if backend is None: + backend = BackendType.TENSORPIPE + + if backend == BackendType.PROCESS_GROUP: + logger.warning( + "RPC was initialized with the PROCESS_GROUP backend which is " + "deprecated and slated to be removed and superseded by the TENSORPIPE " + "backend. It is recommended to migrate to the TENSORPIPE backend." + ) + + if rpc_backend_options is None: # default construct a set of RPC backend options. 
rpc_backend_options = backend_registry.construct_rpc_backend_options( backend diff --git a/torch/distributed/rpc/backend_registry.py b/torch/distributed/rpc/backend_registry.py index 8ca185ab1ff1..6dac7cb0863a 100644 --- a/torch/distributed/rpc/backend_registry.py +++ b/torch/distributed/rpc/backend_registry.py @@ -134,8 +134,21 @@ def _init_process_group(store, rank, world_size): def _process_group_init_backend_handler( store, name, rank, world_size, rpc_backend_options ): + from . import ProcessGroupRpcBackendOptions from . import ProcessGroupAgent + if not isinstance(store, dist.Store): + raise TypeError("`store` must be a c10d::Store. {}".format(store)) + + if not isinstance( + rpc_backend_options, ProcessGroupRpcBackendOptions + ): + raise TypeError( + "`rpc_backend_options` must be a `ProcessGroupRpcBackendOptions`. {}".format( + rpc_backend_options + ) + ) + group = _init_process_group(store, rank, world_size) # TODO: add try-except and destroy _agent in all processes if any fails. diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index f469dd32ea04..e343ffc1939b 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -1,6 +1,7 @@ import concurrent.futures import contextlib import json +import logging import sys from threading import Lock import time @@ -459,6 +460,14 @@ def return_future(): return torch.futures.Future() +class FooBackendOptions(rpc.RpcBackendOptions): + def __init__(self, init_method): + # Must call the __init__ of the superclass (and do so directly, + # without using super()) because... pybind. + rpc.RpcBackendOptions.__init__(self) + self.init_method = init_method + + # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings load_tests = load_tests @@ -3298,9 +3307,98 @@ def test_init_rpc_twice(self): rpc.shutdown() + def test_wrong_types(self): + with self.assertRaisesRegex( + TypeError, + "Argument backend must be a member of BackendType", + ): + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + backend="TENSORPIPE", + ) + + with self.assertRaisesRegex( + TypeError, + "Argument rpc_backend_options must be an instance of RpcBackendOptions", + ): + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + backend=self.rpc_backend, + rpc_backend_options={"init_method": self.init_method} + ) + + def test_cannot_infer_backend_from_options(self): + # An exception should be raised if the backend isn't specified but + # options are given which are not an instance of any of the known + # agents' option classes. + rpc_backend_options = FooBackendOptions(self.init_method) + + with self.assertRaisesRegex(TypeError, "Could not infer backend for options"): + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + # Do _not_ pass backend. + rpc_backend_options=rpc_backend_options, + ) + class ProcessGroupAgentRpcTest(RpcAgentTestFixture): + def test_mismatched_type_for_options(self): + # An exception should be raised if the options are not an instance of + # ProcessGroupRpcBackendOptions. 
+ rpc_backend_options = FooBackendOptions(self.init_method) + + with self.assertRaisesRegex( + TypeError, "`rpc_backend_options` must be a `ProcessGroupRpcBackendOptions`" + ): + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + backend=rpc.BackendType.PROCESS_GROUP, + rpc_backend_options=rpc_backend_options, + ) + + def test_infer_backend_from_options(self): + rpc_backend_options = rpc.ProcessGroupRpcBackendOptions( + init_method=self.init_method + ) + + with self.assertLogs("torch.distributed.rpc", logging.WARNING) as cm: + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + # Do _not_ pass backend. + rpc_backend_options=rpc_backend_options, + ) + self.assertIn( + "To silence this warning pass `backend=BackendType.PROCESS_GROUP` explicitly.", + "\n".join(cm.output), + ) + + self.assertIsInstance(rpc.api._get_current_rpc_agent(), rpc.ProcessGroupAgent) + + def test_logs_deprecation_warning(self): + with self.assertLogs("torch.distributed.rpc", logging.WARNING) as cm: + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + backend=rpc.BackendType.PROCESS_GROUP, + rpc_backend_options=self.rpc_backend_options, + ) + self.assertIn( + "It is recommended to migrate to the TENSORPIPE backend.", + "\n".join(cm.output), + ) + @skip_if_lt_x_gpu(2) @dist_init def test_cuda(self): @@ -3895,6 +3993,37 @@ def test_rpc_script_timeout(self): class TensorPipeAgentRpcTest(RpcAgentTestFixture): + def test_mismatched_type_for_options(self): + # An exception should be raised if the options are not an instance of + # TensorPipeRpcBackendOptions. + rpc_backend_options = FooBackendOptions(self.init_method) + + with self.assertRaisesRegex( + TypeError, "`rpc_backend_options` must be a `TensorPipeRpcBackendOptions`" + ): + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + backend=rpc.BackendType.TENSORPIPE, + rpc_backend_options=rpc_backend_options, + ) + + def test_infer_backend_from_options(self): + rpc_backend_options = rpc.TensorPipeRpcBackendOptions( + init_method=self.init_method + ) + + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + # Do _not_ pass backend. + rpc_backend_options=rpc_backend_options, + ) + + self.assertIsInstance(rpc.api._get_current_rpc_agent(), rpc.TensorPipeAgent) + # FIXME Merge this test with the corresponding one in RpcTest. 
@dist_init(setup_rpc=False) def test_set_and_get_num_worker_threads(self): From e5bade7b2cccf5cccfc812778059578c15c3e8ab Mon Sep 17 00:00:00 2001 From: Martin Yuan Date: Wed, 23 Sep 2020 07:40:51 -0700 Subject: [PATCH 049/449] [PyTorch Mobile] Move string op registrations to prim and make them selective (#44960) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44960 Since we have templated selective build, it should be safe to move the operators to prim so that they can be selectively built in mobile Test Plan: CI Reviewed By: linbinyu Differential Revision: D23772025 fbshipit-source-id: 52cebae76e4df5a6b2b51f2cd82f06f75e2e45d0 --- aten/src/ATen/templates/TypeDefault.cpp | 63 +- tools/build_variables.bzl | 1 - torch/csrc/jit/runtime/register_prim_ops.cpp | 621 ++++++++++++++++-- .../csrc/jit/runtime/register_string_ops.cpp | 499 -------------- 4 files changed, 599 insertions(+), 585 deletions(-) delete mode 100644 torch/csrc/jit/runtime/register_string_ops.cpp diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 6f2b988619c7..c1e7c9ac0c64 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -27,38 +27,37 @@ TORCH_LIBRARY(aten, m) { ${function_registrations}; // String Ops - // Implementations located in torch/csrc/jit/runtime/register_string_ops.cpp - m.def("splitlines(str self, bool keepends=False) -> str[]"); - m.def( - "slice.str(str string, int start, int end=9223372036854775807, int step=1) -> str"); - m.def("isupper(str self) -> bool"); - m.def("islower(str self) -> bool"); - m.def("capitalize(str self) -> str"); - m.def("title(str self) -> str"); - m.def("center(str self, int width, str fillchar=' ') -> str"); - m.def("count(str self, str substr, int start=0, int end=-1) -> int"); - m.def("endswith(str self, str substr, int start=0, int end=-1) -> bool"); - m.def("startswith(str self, str substr, int start=0, int end=-1) -> bool"); - m.def("expandtabs(str self, int tabsize=8) -> str"); - m.def("find(str self, str substr, int start=0, int end=-1) -> int"); - m.def("rfind(str self, str substr, int start=0, int end=-1) -> int"); - m.def("index.str(str self, str substr, int start=0, int end=-1) -> int"); - m.def("rindex(str self, str substr, int start=0, int end=-1) -> int"); - m.def("isidentifier(str self) -> bool"); - m.def("istitle(str self) -> bool"); - m.def("isprintable(str self) -> bool"); - m.def("ljust(str self, int width, str fillchar=' ') -> str"); - m.def("rjust(str self, int width, str fillchar=' ') -> str"); - m.def("zfill(str self, int width) -> str"); - m.def("lstrip(str self, str chars=' \\n\\t\\f\\v') -> str"); - m.def("rstrip(str self, str chars=' \\n\\t\\f\\v') -> str"); - m.def("strip(str self, str chars=' \\n\\t\\f\\v') -> str"); - m.def("replace(str self, str old, str new, int max=-1) -> str"); - m.def("partition(str self, str separator) -> (str, str, str)"); - m.def("rpartition(str self, str separator) -> (str, str, str)"); - m.def("split.str(str self, str? 
separator=None, int max=-1) -> str[]"); - m.def("rsplit(str self, str separator=' ', int max=-1) -> str[]"); - m.def("join(str self, str[] values) -> str"); + // Implementations located in torch/csrc/jit/runtime/register_prim_ops.cpp + m.def(TORCH_SELECTIVE_SCHEMA("aten::splitlines(str self, bool keepends=False) -> str[]")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::slice.str(str string, int start, int end=9223372036854775807, int step=1) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::isupper(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::islower(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::capitalize(str self) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::title(str self) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::center(str self, int width, str fillchar=' ') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::count(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::endswith(str self, str substr, int start=0, int end=-1) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::startswith(str self, str substr, int start=0, int end=-1) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::expandtabs(str self, int tabsize=8) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::find(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rfind(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::index.str(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rindex(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::isidentifier(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::istitle(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::isprintable(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::ljust(str self, int width, str fillchar=' ') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rjust(str self, int width, str fillchar=' ') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::zfill(str self, int width) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::lstrip(str self, str chars=' \\n\\t\\f\\v') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rstrip(str self, str chars=' \\n\\t\\f\\v') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::strip(str self, str chars=' \\n\\t\\f\\v') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::replace(str self, str old, str new, int max=-1) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::partition(str self, str separator) -> (str, str, str)")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rpartition(str self, str separator) -> (str, str, str)")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::split.str(str self, str? 
separator=None, int max=-1) -> str[]")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rsplit(str self, str separator=' ', int max=-1) -> str[]")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::join(str self, str[] values) -> str")); // Integer Ops // Implementations located in torch/csrc/jit/runtime/register_prim_ops_c10.cp diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 3f5126358804..b1a2967f5dea 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -302,7 +302,6 @@ jit_sources_full = [ "torch/csrc/jit/runtime/register_prim_ops.cpp", "torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp", "torch/csrc/jit/runtime/register_special_ops.cpp", - "torch/csrc/jit/runtime/register_string_ops.cpp", "torch/csrc/jit/passes/remove_inplace_ops.cpp", "torch/csrc/jit/passes/utils/check_alias_annotation.cpp", ] diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index ed3e2aceb19a..98f328a43240 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -75,59 +76,6 @@ c10::List splitNoneSeparator(const std::string& string) { return splits; } -TORCH_LIBRARY_IMPL(aten, CatchAll, m) { - m.impl("slice.str", TORCH_FN(stringSlice)); - m.impl("strip", [](std::string string, const std::string& chars) { - auto rindex = string.find_last_not_of(chars); - if (rindex != std::string::npos) { - string = string.substr(0, rindex + 1); - } else { - string = ""; - } - auto lindex = string.find_first_not_of(chars); - if (lindex != std::string::npos) { - string = string.substr(lindex, string.size()); - } else { - string = ""; - } - return string; - }); - m.impl( - "split.str", - [](const std::string& string, - c10::optional separator, - int64_t max) { - if (!separator.has_value()) { - // if separator is not specified, - // a different splitting algorithm is applied as Python - return splitNoneSeparator(string); - ; - } - if (separator.value().empty()) { - throw std::runtime_error("ValueError: empty separator"); - } - - std::string::size_type prev_pos = 0; - std::string::size_type pos = 0; - c10::List splits; - auto count = 0; - - while ((pos = string.find(separator.value(), pos)) != - std::string::npos) { - count++; - if (max >= 0 && count > max) { - break; - } else { - splits.emplace_back(string.substr(prev_pos, pos - prev_pos)); - } - pos += separator.value().size(); - prev_pos = pos; - } - splits.emplace_back(string.substr(prev_pos, string.size() - prev_pos)); - return splits; - }); -} - RegisterOperators reg( {OperatorGenerator( TORCH_SELECTIVE_SCHEMA("aten::str(t elem) -> str"), @@ -1238,6 +1186,573 @@ RegisterOperators reg_dict_ops({ CREATE_DICT_OPS("Tensor"), }); +c10::AliasAnalysisKind aliasAnalysisFromSchema() { + return c10::AliasAnalysisKind::FROM_SCHEMA; +} + +// Convert an python index (which may be negative) into an index usable for a +// C++ container +int64_t normalizeIndex(int64_t idx, int64_t list_size) { + if (idx < 0) { + // Handle negative indexing + idx = list_size + idx; + } + return idx; +} + +int64_t stringFindImpl( + std::string string, + std::string substr, + int64_t start, + int64_t end, + bool reverse = false) { + int64_t size = string.size(); + if (start < 0) { + start = std::max(int64_t(0), int64_t(size + start)); + } + if (end < 0) { + end = std::max(int64_t(0), int64_t(size + end + 1)); + } + if (end > start) { + string = string.substr(start, end - start); + } else { + string = ""; + } + + int64_t result = 
-1; + if (string.size() >= substr.size()) { + auto pos = string.find(substr, 0); + if (reverse) { + auto rpos = pos; + do { + pos = rpos; + rpos = string.find(substr, pos + 1); + } while (rpos != std::string::npos); + } + if (pos != std::string::npos) { + result = pos + start; + } + } + return result; +} + +// String Ops +// Implementations located in torch/csrc/jit/runtime/register_string_ops.cpp +TORCH_LIBRARY_IMPL(aten, CatchAll, m) { + m.impl(TORCH_SELECTIVE_NAME("aten::slice.str"), TORCH_FN(stringSlice)); + m.impl( + TORCH_SELECTIVE_NAME("aten::strip"), + [](std::string string, const std::string& chars) { + auto rindex = string.find_last_not_of(chars); + if (rindex != std::string::npos) { + string = string.substr(0, rindex + 1); + } else { + string = ""; + } + auto lindex = string.find_first_not_of(chars); + if (lindex != std::string::npos) { + string = string.substr(lindex, string.size()); + } else { + string = ""; + } + return string; + }); + m.impl( + TORCH_SELECTIVE_NAME("aten::split.str"), + [](const std::string& string, + c10::optional separator, + int64_t max) { + if (!separator.has_value()) { + // if separator is not specified, + // a different splitting algorithm is applied as Python + return splitNoneSeparator(string); + ; + } + if (separator.value().empty()) { + throw std::runtime_error("ValueError: empty separator"); + } + + std::string::size_type prev_pos = 0; + std::string::size_type pos = 0; + c10::List splits; + auto count = 0; + + while ((pos = string.find(separator.value(), pos)) != + std::string::npos) { + count++; + if (max >= 0 && count > max) { + break; + } else { + splits.emplace_back(string.substr(prev_pos, pos - prev_pos)); + } + pos += separator.value().size(); + prev_pos = pos; + } + splits.emplace_back(string.substr(prev_pos, string.size() - prev_pos)); + return splits; + }); + m.impl( + TORCH_SELECTIVE_NAME("aten::splitlines"), + [](std::string string, bool keepends) { + std::string delimiters = + "\n\r\r\n\v\x0b\f\x0c\x1c\x1d\x1e\x85\u2028\u2029"; + c10::List splits; + + std::string::size_type prev_pos = 0; + std::string::size_type pos = 0; + while ((pos = string.find_first_of(delimiters, pos)) != + std::string::npos) { + splits.emplace_back(string.substr(prev_pos, pos - prev_pos)); + if (keepends) { + splits.emplace_back(string.substr(pos, 1)); + } + pos++; + prev_pos = pos; + } + if (prev_pos != string.size()) { + splits.emplace_back( + string.substr(prev_pos, string.size() - prev_pos)); + } + + return splits; + }); + + // upper and lower require there to be at least one alpha character, + // and ignore all other characters + m.impl(TORCH_SELECTIVE_NAME("aten::isupper"), [](std::string string) { + bool found_alpha = false; + bool is_upper = true; + for (size_t i = 0; i < string.size() && is_upper; ++i) { + char c = string[i]; + found_alpha |= static_cast(::isalpha(c)); + is_upper &= (!::isalpha(c) || ::isupper(c)); + } + return found_alpha && is_upper; + }); + m.impl(TORCH_SELECTIVE_NAME("aten::islower"), [](std::string string) { + bool found_alpha = false; + bool is_lower = true; + for (size_t i = 0; i < string.size() && is_lower; ++i) { + char c = string[i]; + found_alpha |= static_cast(::isalpha(c)); + is_lower &= (!::isalpha(c) || ::islower(c)); + } + return found_alpha && is_lower; + }); + + m.impl(TORCH_SELECTIVE_NAME("aten::capitalize"), [](std::string string) { + std::stringstream ss; + auto first_char = true; + for (char c : string) { + if (first_char) { + ss << static_cast(::toupper(c)); + first_char = false; + } else { + ss << 
static_cast(::tolower(c)); + } + } + return ss.str(); + }); + + m.impl(TORCH_SELECTIVE_NAME("aten::title"), [](std::string string) { + std::stringstream ss; + bool prev_is_nonalpha = true; + for (char c : string) { + if (prev_is_nonalpha) { + ss << static_cast(::toupper(c)); + } else { + ss << static_cast(::tolower(c)); + } + if (::isalpha(c)) { + prev_is_nonalpha = false; + } else { + prev_is_nonalpha = true; + } + } + return ss.str(); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::center"), + [](std::string string, int64_t width, std::string fillchar) { + if (fillchar.size() != 1) { + // TODO: this should be a TypeError + throw std::runtime_error( + "TypeError: The fill character must be exactly one character long"); + } + if (string.size() > static_cast(width)) { + return string; + } + std::stringstream ss; + std::string::size_type full_padding = width - string.size(); + std::string::size_type l_pad = full_padding / 2; + std::string::size_type r_pad = (full_padding + 1) / 2; + if (width % 2) { + auto tmp = r_pad; + r_pad = l_pad; + l_pad = tmp; + } + for (std::string::size_type i = 0; i < l_pad; ++i) { + ss << fillchar; + } + ss << string; + for (std::string::size_type i = 0; i < r_pad; ++i) { + ss << fillchar; + } + return ss.str(); + }); + + // Adapted from + // https://stackoverflow.com/questions/22489073/counting-the-number-of-occurrences-of-a-string-within-a-string + m.impl( + TORCH_SELECTIVE_NAME("aten::count"), + [](std::string string, std::string substr, int64_t start, int64_t end) { + int64_t size = string.size(); + if (start > size) { + return int64_t(0); + } + if (start < 0) { + start = std::max(int64_t(0), int64_t(size + start)); + } + if (end < 0) { + end = std::max(int64_t(0), int64_t(size + end + 1)); + } + + int64_t occurrences = 0; + std::string::size_type pos = start; + while ((pos = string.find(substr, pos)) != std::string::npos) { + if (pos < static_cast(end)) { + ++occurrences; + } else { + break; + } + pos += substr.length(); + } + return occurrences; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::endswith"), + [](std::string string, std::string substr, int64_t start, int64_t end) { + int64_t size = string.size(); + if (start < 0) { + start = std::max(int64_t(0), int64_t(size + start)); + } + if (end < 0) { + end = std::max(int64_t(0), int64_t(size + end + 1)); + } + + string = string.substr(start, end - start); + + auto result = false; + if (string.length() >= substr.length()) { + result = !string.compare( + string.length() - substr.length(), substr.length(), substr); + } + return result; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::startswith"), + [](std::string string, std::string substr, int64_t start, int64_t end) { + int64_t size = string.size(); + if (start < 0) { + start = std::max(int64_t(0), int64_t(size + start)); + } + if (end < 0) { + end = std::max(int64_t(0), int64_t(size + end + 1)); + } + + string = string.substr(start, end - start); + + auto result = false; + if (string.length() >= substr.length()) { + result = !string.compare(0, substr.length(), substr); + } + return result; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::expandtabs"), + [](std::string string, int64_t tabsize) { + std::stringstream ss; + size_t index = 0; + for (const auto& c : string) { + if (c != '\t') { + ss << c; + index++; + } else { + if (tabsize <= 0) { + continue; + } + do { + ss << ' '; + index++; + } while (index % tabsize); + } + } + return ss.str(); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::find"), + [](std::string string, std::string substr, int64_t start, 
int64_t end) { + return stringFindImpl(string, substr, start, end); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::rfind"), + [](std::string string, std::string substr, int64_t start, int64_t end) { + return stringFindImpl(string, substr, start, end, true); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::index.str"), + [](std::string string, std::string substr, int64_t start, int64_t end) { + auto result = stringFindImpl(string, substr, start, end); + if (result < 0) { + throw std::runtime_error("ValueError: substring not found"); + } + return result; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::rindex"), + [](std::string string, std::string substr, int64_t start, int64_t end) { + auto result = stringFindImpl(string, substr, start, end, true); + if (result < 0) { + throw std::runtime_error("ValueError: substring not found"); + } + return result; + }); + + m.impl(TORCH_SELECTIVE_NAME("aten::isidentifier"), [](std::string string) { + LOG(WARNING) + << "The isidentifier() implementation being used is from Python 2\n"; + if (string.size() < 1) { + return false; + } + if (::isdigit(string[0])) { + return false; + } + auto result = std::all_of( + string.begin(), string.end(), [](char c) { return ::isalnum(c); }); + return result; + }); + + m.impl(TORCH_SELECTIVE_NAME("aten::istitle"), [](std::string string) { + auto result = false; + + bool prev_is_alpha = false; + for (char c : string) { + if (prev_is_alpha) { + if (c != static_cast(::tolower(c))) { + result = false; + break; + } + } else { + if (c != static_cast(::toupper(c))) { + result = false; + break; + } + // Only true if there exists at least one alpha + if (::isalpha(c)) { + result = true; + } + } + if (::isalpha(c)) { + prev_is_alpha = true; + } else { + prev_is_alpha = false; + } + } + return result; + }); + + // Can't reuse DEFINE_STRING_IS_OP because "" is printable + m.impl(TORCH_SELECTIVE_NAME("aten::isprintable"), [](std::string string) { + auto result = std::all_of(string.begin(), string.end(), [](char c) { + return ::isalnum(c) || ::ispunct(c) || c == ' '; + }); + return result; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::ljust"), + [](std::string string, int64_t width, std::string fillchar) { + if (fillchar.size() != 1) { + // TODO: this should be a TypeError + throw std::runtime_error( + "TypeError: The fill character must be exactly one character long"); + } + auto to_append = + std::max(int64_t(0), width - static_cast(string.size())); + + std::stringstream ss; + ss << string; + for (auto i = 0; i < to_append; ++i) { + ss << fillchar; + } + + return ss.str(); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::rjust"), + [](std::string string, int64_t width, std::string fillchar) { + if (fillchar.size() != 1) { + // TODO: this should be a TypeError + throw std::runtime_error( + "TypeError: The fill character must be exactly one character long"); + } + auto to_append = + std::max(int64_t(0), width - static_cast(string.size())); + + std::stringstream ss; + for (auto i = 0; i < to_append; ++i) { + ss << fillchar; + } + ss << string; + return ss.str(); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::zfill"), + [](std::string string, int64_t width) { + auto to_append = + std::max(int64_t(0), width - static_cast(string.size())); + + std::stringstream ss; + for (auto i = 0; i < to_append; ++i) { + ss << '0'; + } + ss << string; + + return ss.str(); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::lstrip"), + [](std::string string, std::string chars) { + auto index = string.find_first_not_of(chars); + if (index != 
std::string::npos) { + string = string.substr(index, string.size()); + } else { + string = ""; + } + return string; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::rstrip"), + [](std::string string, std::string chars) { + auto index = string.find_last_not_of(chars); + if (index != std::string::npos) { + string = string.substr(0, index + 1); + } else { + string = ""; + } + return string; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::replace"), + [](std::string string, + std::string old_str, + std::string new_str, + int64_t max) { + int64_t occurrences = 0; + std::string::size_type pos = 0; + while ((pos = string.find(old_str, pos)) != std::string::npos) { + if (max >= 0 && ++occurrences > max) { + break; + } + string = string.replace(pos, old_str.length(), new_str); + pos += new_str.length(); + } + + return string; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::partition"), + [](std::string string, std::string separator) { + auto pos = string.find(separator, 0); + if (pos == std::string::npos) { + pos = string.size(); + separator = ""; + } + auto pre_partition = string.substr(0, pos); + auto post_partition = + string.substr(pos + separator.size(), string.size()); + + return std::make_tuple(pre_partition, separator, post_partition); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::rpartition"), + [](std::string string, std::string separator) { + auto pos = string.find(separator, 0); + auto rpos = pos; + do { + pos = rpos; + rpos = string.find(separator, pos + 1); + } while (rpos != std::string::npos); + + if (pos == std::string::npos) { + pos = 0; + separator = ""; + } + + auto pre_partition = string.substr(0, pos); + auto post_partition = + string.substr(pos + separator.size(), string.size()); + + return std::make_tuple(pre_partition, separator, post_partition); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::rsplit"), + [](std::string string, std::string separator, int64_t max) { + std::reverse(separator.begin(), separator.end()); + std::reverse(string.begin(), string.end()); + + std::string::size_type prev_pos = 0; + std::string::size_type pos = 0; + c10::List splits; + auto count = 0; + while ((pos = string.find(separator, pos)) != std::string::npos) { + count++; + if (max >= 0 && count > max) { + break; + } else { + auto substr = string.substr(prev_pos, pos - prev_pos); + std::reverse(substr.begin(), substr.end()); + splits.emplace(splits.begin(), substr); + } + pos += separator.size(); + prev_pos = pos; + } + auto substr = string.substr(prev_pos, string.size() - prev_pos); + std::reverse(substr.begin(), substr.end()); + splits.emplace(splits.begin(), substr); + return splits; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::join"), + [](const std::string& string, const c10::List& values) { + std::stringstream ss; + for (auto it = values.begin(); it != values.end(); ++it) { + ss << static_cast(*it); + if (it != values.end() - 1) { + ss << string; + } + } + return ss.str(); + }); +} + } // namespace } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/runtime/register_string_ops.cpp b/torch/csrc/jit/runtime/register_string_ops.cpp deleted file mode 100644 index 244893b10393..000000000000 --- a/torch/csrc/jit/runtime/register_string_ops.cpp +++ /dev/null @@ -1,499 +0,0 @@ -#include -#include -#include - -namespace torch { -namespace jit { -namespace { - -c10::AliasAnalysisKind aliasAnalysisFromSchema() { - return c10::AliasAnalysisKind::FROM_SCHEMA; -} - -// Convert an python index (which may be negative) into an index usable for a -// C++ container -int64_t 
normalizeIndex(int64_t idx, int64_t list_size) { - if (idx < 0) { - // Handle negative indexing - idx = list_size + idx; - } - return idx; -} - -int64_t stringFindImpl( - std::string string, - std::string substr, - int64_t start, - int64_t end, - bool reverse = false) { - int64_t size = string.size(); - if (start < 0) { - start = std::max(int64_t(0), int64_t(size + start)); - } - if (end < 0) { - end = std::max(int64_t(0), int64_t(size + end + 1)); - } - if (end > start) { - string = string.substr(start, end - start); - } else { - string = ""; - } - - int64_t result = -1; - if (string.size() >= substr.size()) { - auto pos = string.find(substr, 0); - if (reverse) { - auto rpos = pos; - do { - pos = rpos; - rpos = string.find(substr, pos + 1); - } while (rpos != std::string::npos); - } - if (pos != std::string::npos) { - result = pos + start; - } - } - return result; -} - -// String Ops -// Implementations located in torch/csrc/jit/runtime/register_string_ops.cpp -TORCH_LIBRARY_IMPL(aten, CatchAll, m) { - m.impl("splitlines", [](std::string string, bool keepends) { - std::string delimiters = "\n\r\r\n\v\x0b\f\x0c\x1c\x1d\x1e\x85\u2028\u2029"; - c10::List splits; - - std::string::size_type prev_pos = 0; - std::string::size_type pos = 0; - while ((pos = string.find_first_of(delimiters, pos)) != std::string::npos) { - splits.emplace_back(string.substr(prev_pos, pos - prev_pos)); - if (keepends) { - splits.emplace_back(string.substr(pos, 1)); - } - pos++; - prev_pos = pos; - } - if (prev_pos != string.size()) { - splits.emplace_back(string.substr(prev_pos, string.size() - prev_pos)); - } - - return splits; - }); - - // upper and lower require there to be at least one alpha character, - // and ignore all other characters - m.impl("isupper", [](std::string string) { - bool found_alpha = false; - bool is_upper = true; - for (size_t i = 0; i < string.size() && is_upper; ++i) { - char c = string[i]; - found_alpha |= static_cast(::isalpha(c)); - is_upper &= (!::isalpha(c) || ::isupper(c)); - } - return found_alpha && is_upper; - }); - m.impl("islower", [](std::string string) { - bool found_alpha = false; - bool is_lower = true; - for (size_t i = 0; i < string.size() && is_lower; ++i) { - char c = string[i]; - found_alpha |= static_cast(::isalpha(c)); - is_lower &= (!::isalpha(c) || ::islower(c)); - } - return found_alpha && is_lower; - }); - - m.impl("capitalize", [](std::string string) { - std::stringstream ss; - auto first_char = true; - for (char c : string) { - if (first_char) { - ss << static_cast(::toupper(c)); - first_char = false; - } else { - ss << static_cast(::tolower(c)); - } - } - return ss.str(); - }); - - m.impl("title", [](std::string string) { - std::stringstream ss; - bool prev_is_nonalpha = true; - for (char c : string) { - if (prev_is_nonalpha) { - ss << static_cast(::toupper(c)); - } else { - ss << static_cast(::tolower(c)); - } - if (::isalpha(c)) { - prev_is_nonalpha = false; - } else { - prev_is_nonalpha = true; - } - } - return ss.str(); - }); - - m.impl("center", [](std::string string, int64_t width, std::string fillchar) { - if (fillchar.size() != 1) { - // TODO: this should be a TypeError - throw std::runtime_error( - "TypeError: The fill character must be exactly one character long"); - } - if (string.size() > static_cast(width)) { - return string; - } - std::stringstream ss; - std::string::size_type full_padding = width - string.size(); - std::string::size_type l_pad = full_padding / 2; - std::string::size_type r_pad = (full_padding + 1) / 2; - if (width % 2) { - auto tmp 
= r_pad; - r_pad = l_pad; - l_pad = tmp; - } - for (std::string::size_type i = 0; i < l_pad; ++i) { - ss << fillchar; - } - ss << string; - for (std::string::size_type i = 0; i < r_pad; ++i) { - ss << fillchar; - } - return ss.str(); - }); - - // Adapted from - // https://stackoverflow.com/questions/22489073/counting-the-number-of-occurrences-of-a-string-within-a-string - m.impl( - "count", - [](std::string string, std::string substr, int64_t start, int64_t end) { - int64_t size = string.size(); - if (start > size) { - return int64_t(0); - } - if (start < 0) { - start = std::max(int64_t(0), int64_t(size + start)); - } - if (end < 0) { - end = std::max(int64_t(0), int64_t(size + end + 1)); - } - - int64_t occurrences = 0; - std::string::size_type pos = start; - while ((pos = string.find(substr, pos)) != std::string::npos) { - if (pos < static_cast(end)) { - ++occurrences; - } else { - break; - } - pos += substr.length(); - } - return occurrences; - }); - - m.impl( - "endswith", - [](std::string string, std::string substr, int64_t start, int64_t end) { - int64_t size = string.size(); - if (start < 0) { - start = std::max(int64_t(0), int64_t(size + start)); - } - if (end < 0) { - end = std::max(int64_t(0), int64_t(size + end + 1)); - } - - string = string.substr(start, end - start); - - auto result = false; - if (string.length() >= substr.length()) { - result = !string.compare( - string.length() - substr.length(), substr.length(), substr); - } - return result; - }); - - m.impl( - "startswith", - [](std::string string, std::string substr, int64_t start, int64_t end) { - int64_t size = string.size(); - if (start < 0) { - start = std::max(int64_t(0), int64_t(size + start)); - } - if (end < 0) { - end = std::max(int64_t(0), int64_t(size + end + 1)); - } - - string = string.substr(start, end - start); - - auto result = false; - if (string.length() >= substr.length()) { - result = !string.compare(0, substr.length(), substr); - } - return result; - }); - - m.impl("expandtabs", [](std::string string, int64_t tabsize) { - std::stringstream ss; - size_t index = 0; - for (const auto& c : string) { - if (c != '\t') { - ss << c; - index++; - } else { - if (tabsize <= 0) { - continue; - } - do { - ss << ' '; - index++; - } while (index % tabsize); - } - } - return ss.str(); - }); - - m.impl( - "find", - [](std::string string, std::string substr, int64_t start, int64_t end) { - return stringFindImpl(string, substr, start, end); - }); - - m.impl( - "rfind", - [](std::string string, std::string substr, int64_t start, int64_t end) { - return stringFindImpl(string, substr, start, end, true); - }); - - m.impl( - "index.str", - [](std::string string, std::string substr, int64_t start, int64_t end) { - auto result = stringFindImpl(string, substr, start, end); - if (result < 0) { - throw std::runtime_error("ValueError: substring not found"); - } - return result; - }); - - m.impl( - "rindex", - [](std::string string, std::string substr, int64_t start, int64_t end) { - auto result = stringFindImpl(string, substr, start, end, true); - if (result < 0) { - throw std::runtime_error("ValueError: substring not found"); - } - return result; - }); - - m.impl("isidentifier", [](std::string string) { - LOG(WARNING) - << "The isidentifier() implementation being used is from Python 2\n"; - if (string.size() < 1) { - return false; - } - if (::isdigit(string[0])) { - return false; - } - auto result = std::all_of( - string.begin(), string.end(), [](char c) { return ::isalnum(c); }); - return result; - }); - - m.impl("istitle", 
[](std::string string) { - auto result = false; - - bool prev_is_alpha = false; - for (char c : string) { - if (prev_is_alpha) { - if (c != static_cast(::tolower(c))) { - result = false; - break; - } - } else { - if (c != static_cast(::toupper(c))) { - result = false; - break; - } - // Only true if there exists at least one alpha - if (::isalpha(c)) { - result = true; - } - } - if (::isalpha(c)) { - prev_is_alpha = true; - } else { - prev_is_alpha = false; - } - } - return result; - }); - - // Can't reuse DEFINE_STRING_IS_OP because "" is printable - m.impl("isprintable", [](std::string string) { - auto result = std::all_of(string.begin(), string.end(), [](char c) { - return ::isalnum(c) || ::ispunct(c) || c == ' '; - }); - return result; - }); - - m.impl("ljust", [](std::string string, int64_t width, std::string fillchar) { - if (fillchar.size() != 1) { - // TODO: this should be a TypeError - throw std::runtime_error( - "TypeError: The fill character must be exactly one character long"); - } - auto to_append = - std::max(int64_t(0), width - static_cast(string.size())); - - std::stringstream ss; - ss << string; - for (auto i = 0; i < to_append; ++i) { - ss << fillchar; - } - - return ss.str(); - }); - - m.impl("rjust", [](std::string string, int64_t width, std::string fillchar) { - if (fillchar.size() != 1) { - // TODO: this should be a TypeError - throw std::runtime_error( - "TypeError: The fill character must be exactly one character long"); - } - auto to_append = - std::max(int64_t(0), width - static_cast(string.size())); - - std::stringstream ss; - for (auto i = 0; i < to_append; ++i) { - ss << fillchar; - } - ss << string; - return ss.str(); - }); - - m.impl("zfill", [](std::string string, int64_t width) { - auto to_append = - std::max(int64_t(0), width - static_cast(string.size())); - - std::stringstream ss; - for (auto i = 0; i < to_append; ++i) { - ss << '0'; - } - ss << string; - - return ss.str(); - }); - - m.impl("lstrip", [](std::string string, std::string chars) { - auto index = string.find_first_not_of(chars); - if (index != std::string::npos) { - string = string.substr(index, string.size()); - } else { - string = ""; - } - return string; - }); - - m.impl("rstrip", [](std::string string, std::string chars) { - auto index = string.find_last_not_of(chars); - if (index != std::string::npos) { - string = string.substr(0, index + 1); - } else { - string = ""; - } - return string; - }); - - m.impl( - "replace", - [](std::string string, - std::string old_str, - std::string new_str, - int64_t max) { - int64_t occurrences = 0; - std::string::size_type pos = 0; - while ((pos = string.find(old_str, pos)) != std::string::npos) { - if (max >= 0 && ++occurrences > max) { - break; - } - string = string.replace(pos, old_str.length(), new_str); - pos += new_str.length(); - } - - return string; - }); - - m.impl("partition", [](std::string string, std::string separator) { - auto pos = string.find(separator, 0); - if (pos == std::string::npos) { - pos = string.size(); - separator = ""; - } - auto pre_partition = string.substr(0, pos); - auto post_partition = string.substr(pos + separator.size(), string.size()); - - return std::make_tuple(pre_partition, separator, post_partition); - }); - - m.impl("rpartition", [](std::string string, std::string separator) { - auto pos = string.find(separator, 0); - auto rpos = pos; - do { - pos = rpos; - rpos = string.find(separator, pos + 1); - } while (rpos != std::string::npos); - - if (pos == std::string::npos) { - pos = 0; - separator = ""; - } - - auto 
pre_partition = string.substr(0, pos); - auto post_partition = string.substr(pos + separator.size(), string.size()); - - return std::make_tuple(pre_partition, separator, post_partition); - }); - - m.impl("rsplit", [](std::string string, std::string separator, int64_t max) { - std::reverse(separator.begin(), separator.end()); - std::reverse(string.begin(), string.end()); - - std::string::size_type prev_pos = 0; - std::string::size_type pos = 0; - c10::List splits; - auto count = 0; - while ((pos = string.find(separator, pos)) != std::string::npos) { - count++; - if (max >= 0 && count > max) { - break; - } else { - auto substr = string.substr(prev_pos, pos - prev_pos); - std::reverse(substr.begin(), substr.end()); - splits.emplace(splits.begin(), substr); - } - pos += separator.size(); - prev_pos = pos; - } - auto substr = string.substr(prev_pos, string.size() - prev_pos); - std::reverse(substr.begin(), substr.end()); - splits.emplace(splits.begin(), substr); - return splits; - }); - - m.impl( - "join", - [](const std::string& string, const c10::List& values) { - std::stringstream ss; - for (auto it = values.begin(); it != values.end(); ++it) { - ss << static_cast(*it); - if (it != values.end() - 1) { - ss << string; - } - } - return ss.str(); - }); -} - -} // namespace -} // namespace jit -} // namespace torch From 94c3cdd994dfe276b89ffb412cfe8a80b2f37bc7 Mon Sep 17 00:00:00 2001 From: Shen Li Date: Wed, 23 Sep 2020 08:04:27 -0700 Subject: [PATCH 050/449] Let rpc._all_gather use default RPC timeout (#44983) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44983 `_all_gather` was converted from `_wait_all_workers` and inherited its 5 seconds fixed timeout. As `_all_gather` meant to support a broader set of use cases, the timeout configuration should be more flexible. This PR makes `rpc._all_gather` use the global default RPC timeout. 
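For illustration, a minimal sketch of how the more flexible timeout plays out in practice. `_all_gather` is an internal helper and the gathered object below is a placeholder, so treat this as an assumption about usage rather than a public API:

```python
import torch.distributed.rpc as rpc
from torch.distributed.rpc.constants import DEFAULT_SHUTDOWN_TIMEOUT

# Inside an already-initialized RPC worker group:

# With no timeout argument, _all_gather now falls back to the globally
# configured RPC timeout instead of a hard-coded 5 seconds.
results = rpc.api._all_gather(some_picklable_obj)  # some_picklable_obj is hypothetical

# Callers that need a specific bound pass one explicitly, as
# _wait_all_workers now does during shutdown.
results = rpc.api._all_gather(None, timeout=DEFAULT_SHUTDOWN_TIMEOUT)
```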
Test Plan: Imported from OSS Reviewed By: pritamdamania87 Differential Revision: D23794383 Pulled By: mrshenli fbshipit-source-id: 382f52c375f0f25c032c5abfc910f72baf4c5ad9 --- torch/csrc/distributed/rpc/init.cpp | 1 - .../distributed/rpc/process_group_agent.cpp | 8 +----- torch/csrc/distributed/rpc/rpc_agent.h | 3 +++ .../csrc/distributed/rpc/tensorpipe_agent.cpp | 8 ++---- .../testing/faulty_process_group_agent.cpp | 4 --- torch/distributed/rpc/api.py | 17 ++++++------ torch/distributed/rpc/constants.py | 1 + .../_internal/distributed/rpc/rpc_test.py | 27 +++++++++++++++++++ 8 files changed, 42 insertions(+), 27 deletions(-) diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index ea1db04225c7..34023afdce91 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -27,7 +27,6 @@ namespace rpc { namespace { constexpr std::chrono::milliseconds kDeleteAllUsersTimeout(100000); -constexpr float kSecToMsConversion = 1000; template using shared_ptr_class_ = py::class_>; diff --git a/torch/csrc/distributed/rpc/process_group_agent.cpp b/torch/csrc/distributed/rpc/process_group_agent.cpp index fe93e43d01f3..d97577724a55 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/process_group_agent.cpp @@ -8,12 +8,6 @@ namespace torch { namespace distributed { namespace rpc { -const std::string kRPCTimeoutErrorStr = - "RPC ran for more than {} milliseconds and timed out."; - -namespace { -constexpr auto kSecToMsConversion = 1000; -} ////////////////////////// MessageCounter ///////////////////////////////// @@ -802,7 +796,7 @@ void ProcessGroupAgent::pollTimedOutRPCs() { for (const auto& timedOutFuture : timedOutFutures) { auto errStr = - fmt::format(kRPCTimeoutErrorStr, timedOutFuture.timeout_.count()); + fmt::format(kRpcTimeoutErrorStr, timedOutFuture.timeout_.count()); auto err = makeRPCError(errStr, RPCErrorType::TIMEOUT); if (!timedOutFuture.future_->hasError()) { diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index 605744a1f227..34b77a085510 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -17,6 +17,9 @@ constexpr float kDefaultRpcTimeoutSeconds = 60; // timeout for RPCs. 
constexpr float kUnsetRpcTimeout = -1; constexpr auto kDefaultInitMethod = "env://"; +constexpr float kSecToMsConversion = 1000; +constexpr auto kRpcTimeoutErrorStr = + "RPC ran for more than set timeout ({} ms) and will now be marked with an error"; using steady_clock_time_point = std::chrono::time_point; diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index d9ce2c3b27eb..11c5408c2c35 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -22,16 +22,12 @@ namespace { const std::string kSocketIfnameEnvVar = "TP_SOCKET_IFNAME"; const std::string kDefaultUvAddress = "127.0.0.1"; -constexpr long kToMilliseconds = 1000; - const std::string kGilAverageWaitTime = "agent.gil_average_wait_time_us"; const std::string kThreadPoolSize = "agent.thread_pool_size"; const std::string kNumIdleThreads = "agent.num_idle_threads"; const std::string kClientActiveCalls = "agent.client_active_calls"; const std::string kServerActiveCalls = "agent.server_active_calls"; const std::string kServerActiveAsyncCalls = "agent.server_active_async_calls"; -const std::string kRpcTimeoutErrorStr = - "RPC ran for more than set timeout ({} ms) and will now be marked with an error"; inline void checkCPUTensor(const torch::Tensor& tensor) { TORCH_CHECK( @@ -273,7 +269,7 @@ TensorPipeAgent::TensorPipeAgent( WorkerInfo(std::move(selfName), selfId), std::move(cb), std::chrono::milliseconds( - (long)(opts.rpcTimeoutSeconds * kToMilliseconds))), + (long)(opts.rpcTimeoutSeconds * kSecToMsConversion))), opts_(std::move(opts)), threadPool_(opts_.numWorkerThreads), context_(std::make_shared( @@ -685,7 +681,7 @@ std::shared_ptr TensorPipeAgent::send( auto timeout = rpcTimeoutSeconds == kUnsetRpcTimeout ? getRpcTimeout() : std::chrono::milliseconds( - static_cast(rpcTimeoutSeconds * kToMilliseconds)); + static_cast(rpcTimeoutSeconds * kSecToMsConversion)); // We only add to the timeoutMap_ if the timeout is not 0. Per our // documentation, a user-provided timeout of 0 indicates the RPC should never diff --git a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp index a03ff5cafecd..a1be688a285e 100644 --- a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp @@ -6,10 +6,6 @@ namespace torch { namespace distributed { namespace rpc { -namespace { -constexpr auto kSecToMsConversion = 1000; -} - std::string fromVec(const std::vector& vec) { return std::string(vec.begin(), vec.end()); } diff --git a/torch/distributed/rpc/api.py b/torch/distributed/rpc/api.py index af28e6023c60..d1b62a5b0ab4 100644 --- a/torch/distributed/rpc/api.py +++ b/torch/distributed/rpc/api.py @@ -12,6 +12,7 @@ PyRRef, RemoteProfilerManager, WorkerInfo, + get_rpc_timeout, _cleanup_python_rpc_handler, _delete_all_user_and_unforked_owner_rrefs, _destroy_rref_context, @@ -34,7 +35,7 @@ _build_rpc_profiling_key, ) -from .constants import UNSET_RPC_TIMEOUT +from .constants import DEFAULT_SHUTDOWN_TIMEOUT, UNSET_RPC_TIMEOUT logger = logging.getLogger(__name__) @@ -142,7 +143,7 @@ def _broadcast_to_followers(sequence_id, objects_map): @_require_initialized -def _all_gather(obj): +def _all_gather(obj, timeout=UNSET_RPC_TIMEOUT): r""" This is similar to torch.distributed.all_gather(), but is using RPC. It picks the worker with the smallest name (alphabetic order) as the leader. 
@@ -163,8 +164,8 @@ def _all_gather(obj): _all_gather_sequence_id += 1 is_leader = leader_name == self_name - # Set a long enough timeout for all shutdown messages to be processed. - timeout = 5 # second + if timeout == UNSET_RPC_TIMEOUT: + timeout = get_rpc_timeout() # Phase 1: Followers send it's object to the leader if is_leader: @@ -178,9 +179,7 @@ def _all_gather(obj): ) with _all_gather_dict_lock: - states = _all_gather_sequence_id_to_states[ - sequence_id - ] + states = _all_gather_sequence_id_to_states[sequence_id] states.proceed_signal.wait() # Phase 2: Leader broadcast gathered results to all followers @@ -207,7 +206,7 @@ def _all_gather(obj): if errors: raise RuntimeError( f"Followers {[e[0] for e in errors]} timed out in _all_gather " - f"after {timeout} seconds. The first exception is {errors[0][1]}" + f"after {timeout:.2f} seconds. The first exception is {errors[0][1]}" ) return states.gathered_objects @@ -223,7 +222,7 @@ def _wait_all_workers(): framework will work after this method returns. """ try: - _all_gather(None) + _all_gather(None, timeout=DEFAULT_SHUTDOWN_TIMEOUT) except RuntimeError as ex: logger.error( f"Failed to respond to 'Shutdown Proceed' in time, got error {ex}" diff --git a/torch/distributed/rpc/constants.py b/torch/distributed/rpc/constants.py index ecd9552ce40b..c2dd804e4c81 100644 --- a/torch/distributed/rpc/constants.py +++ b/torch/distributed/rpc/constants.py @@ -12,6 +12,7 @@ # For any RpcAgent. DEFAULT_RPC_TIMEOUT_SEC = _DEFAULT_RPC_TIMEOUT_SEC DEFAULT_INIT_METHOD = _DEFAULT_INIT_METHOD +DEFAULT_SHUTDOWN_TIMEOUT = 5.0 # For ProcessGroupAgent. DEFAULT_NUM_SEND_RECV_THREADS = _DEFAULT_NUM_SEND_RECV_THREADS diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index e343ffc1939b..797e5a010b86 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -131,6 +131,19 @@ def set(self, val): self.t = val +class SlowPickleClass: + def __init__(self, t): + self.t = t + + def __getstate__(self): + time.sleep(self.t) + return (self.t, ) + + def __setstate__(self, obj): + self.t = obj[0] + time.sleep(self.t) + + class MyClass: def __init__(self, a): self.a = a @@ -931,6 +944,20 @@ def test_all_gather(self): self.assertEqual(expected, results) + @dist_init + def test_all_gather_timeout(self): + rpc._set_rpc_timeout(0.1) + + if self.rank == 0: + with self.assertRaisesRegex( + RuntimeError, + "timed out in _all_gather after 0\\.10 seconds" + ): + rpc.api._all_gather(SlowPickleClass(0.5)) + else: + with self.assertRaisesRegex(RuntimeError, "timeout.*100 ms"): + rpc.api._all_gather(SlowPickleClass(0.5)) + @dist_init def test_graceful_shutdown_with_uneven_workload(self): """Test graceful termination.""" From 5b20bf4fd97c9acd899ea780cda8738aa775cdec Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Wed, 23 Sep 2020 08:24:08 -0700 Subject: [PATCH 051/449] Added support for complex input for Cholesky decomposition (#44895) Summary: Cholesky decomposition now works for complex inputs. Fixes https://github.com/pytorch/pytorch/issues/44637. 
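As a quick illustration of the newly supported path, here is a hedged sketch mirroring the test changes below (CPU needs LAPACK and CUDA needs MAGMA for this to run):

```python
import torch

# Build a Hermitian positive-definite matrix A = B @ B^H from a random complex B
# (constructed from real parts, mirroring the workaround used in the tests below).
B = torch.randn(4, 4, dtype=torch.float64) + 1j * torch.randn(4, 4, dtype=torch.float64)
A = B @ B.t().conj()

L = torch.cholesky(A)                # lower-triangular factor (default)
U = torch.cholesky(A, upper=True)    # upper-triangular factor

# The factors reconstruct A; note the conjugate transpose, not a plain transpose.
assert (L @ L.t().conj() - A).abs().max() < 1e-10
assert (U.t().conj() @ U - A).abs().max() < 1e-10
```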
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44895 Reviewed By: ailzhang Differential Revision: D23841583 Pulled By: anjali411 fbshipit-source-id: 3b1f34a7af17827884540696f8771a0d5b1df478 --- .../ATen/native/cuda/BatchLinearAlgebra.cu | 36 ++++++++++++++- test/test_torch.py | 45 +++++++++++++++---- 2 files changed, 71 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 5394c2a23239..c86f355a67c2 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -344,6 +344,24 @@ void magmaCholesky( AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaCholesky>( + magma_uplo_t uplo, magma_int_t n, c10::complex* dA, + magma_int_t ldda, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_zpotrf_gpu(uplo, n, reinterpret_cast(dA), ldda, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaCholesky>( + magma_uplo_t uplo, magma_int_t n, c10::complex* dA, + magma_int_t ldda, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_cpotrf_gpu(uplo, n, reinterpret_cast(dA), ldda, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + template<> void magmaCholeskyBatched( magma_uplo_t uplo, magma_int_t n, double** dA_array, magma_int_t ldda, @@ -360,6 +378,22 @@ void magmaCholeskyBatched( AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaCholeskyBatched>( + magma_uplo_t uplo, magma_int_t n, c10::complex** dA_array, magma_int_t ldda, + magma_int_t* info_array, magma_int_t batchsize, const MAGMAQueue& magma_queue) { + magma_zpotrf_batched(uplo, n, reinterpret_cast(dA_array), ldda, info_array, batchsize, magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaCholeskyBatched>( + magma_uplo_t uplo, magma_int_t n, c10::complex** dA_array, magma_int_t ldda, + magma_int_t* info_array, magma_int_t batchsize, const MAGMAQueue& magma_queue) { + magma_cpotrf_batched(uplo, n, reinterpret_cast(dA_array), ldda, info_array, batchsize, magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} + template<> void magmaTriangularSolve( magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, @@ -904,7 +938,7 @@ Tensor _cholesky_helper_cuda(const Tensor& self, bool upper) { self_working_copy = cloneBatchedColumnMajor(self); } - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "cholesky_cuda", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "cholesky_cuda", [&]{ apply_cholesky(self_working_copy, false, infos); }); if (self.dim() > 2) { diff --git a/test/test_torch.py b/test/test_torch.py index 6b529712ab5c..12a72c2f11c8 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -7739,14 +7739,29 @@ def cholesky_test_helper(n, batchsize, device, upper): for upper, batchsize in product([True, False], [262144, 524288]): cholesky_test_helper(2, batchsize, device, upper) + @precisionOverride({torch.float32: 1e-4, torch.complex64: 1e-4}) @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_cholesky_batched(self, device, dtype): - from torch.testing._internal.common_utils import random_symmetric_pd_matrix + from torch.testing._internal.common_utils import \ + (random_symmetric_pd_matrix, + random_fullrank_matrix_distinct_singular_value) def cholesky_test_helper(n, batch_dims, upper): - A = random_symmetric_pd_matrix(n, *batch_dims, dtype=dtype, 
device=device) + # This is a workaround while there is no support for complex random_symmetric_pd_matrix + if dtype.is_complex: + real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + A_real = random_fullrank_matrix_distinct_singular_value(n, *batch_dims, dtype=real_dtype, device=device) + A_imag = random_fullrank_matrix_distinct_singular_value(n, *batch_dims, dtype=real_dtype, device=device) + A = A_real + 1j * A_imag + # There is no support for complex batched matmul yet + matmul_list = [] + for mat in A.contiguous().view(-1, n, n): + matmul_list.append(mat @ mat.t().conj()) + A = torch.stack(matmul_list).view(*batch_dims, n, n) + else: + A = random_symmetric_pd_matrix(n, *batch_dims, dtype=dtype, device=device) cholesky_exp = torch.stack([m.cholesky(upper=upper) for m in A.reshape(-1, n, n)]) cholesky_exp = cholesky_exp.reshape_as(A) self.assertEqual(cholesky_exp, torch.cholesky(A, upper=upper)) @@ -7754,26 +7769,38 @@ def cholesky_test_helper(n, batch_dims, upper): for upper, batchsize in product([True, False], [(3,), (3, 4), (2, 3, 4)]): cholesky_test_helper(3, batchsize, upper) + @precisionOverride({torch.float32: 1e-4, torch.complex64: 1e-4}) @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_cholesky(self, device, dtype): - x = torch.rand(10, 10, dtype=dtype, device=device) + 1e-1 - A = torch.mm(x, x.t()) + from torch.testing._internal.common_utils import \ + (random_symmetric_pd_matrix, + random_fullrank_matrix_distinct_singular_value) + + # This is a workaround while there is no support for complex random_symmetric_pd_matrix + if dtype.is_complex: + real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + A_real = random_fullrank_matrix_distinct_singular_value(10, dtype=real_dtype, device=device) + A_imag = random_fullrank_matrix_distinct_singular_value(10, dtype=real_dtype, device=device) + A = A_real + 1j * A_imag + A = A @ A.t().conj() + else: + A = random_symmetric_pd_matrix(10, dtype=dtype, device=device) # default Case C = torch.cholesky(A) - B = torch.mm(C, C.t()) + B = torch.mm(C, C.t().conj()) self.assertEqual(A, B, atol=1e-14, rtol=0) # test Upper Triangular U = torch.cholesky(A, True) - B = torch.mm(U.t(), U) + B = torch.mm(U.t().conj(), U) self.assertEqual(A, B, atol=1e-14, rtol=0, msg='cholesky (upper) did not allow rebuilding the original matrix') # test Lower Triangular L = torch.cholesky(A, False) - B = torch.mm(L, L.t()) + B = torch.mm(L, L.t().conj()) self.assertEqual(A, B, atol=1e-14, rtol=0, msg='cholesky (lower) did not allow rebuilding the original matrix') def test_view(self, device): From 9e30a76697ddedb46887a92559b317840bce6804 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 23 Sep 2020 09:47:05 -0700 Subject: [PATCH 052/449] Filter `strtod_l` is undeclared errors from sccache log (#45183) Summary: This prevents DrCI from misidentifying test failures for the compilation failures, such as: ``` /var/lib/jenkins/workspace/build/CMakeFiles/CMakeTmp/CheckSymbolExists.c:8:19: error: use of undeclared identifier \'strtod_l\' return ((int*)(&strtod_l))[argc]; ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/45183 Reviewed By: ezyang Differential Revision: D23859267 Pulled By: malfet fbshipit-source-id: 283d9bd2ab712f23239b72f3758d121e2d026fb0 --- .jenkins/pytorch/print_sccache_log.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.jenkins/pytorch/print_sccache_log.py 
b/.jenkins/pytorch/print_sccache_log.py index c91472876c33..81c7e0752328 100644 --- a/.jenkins/pytorch/print_sccache_log.py +++ b/.jenkins/pytorch/print_sccache_log.py @@ -6,6 +6,7 @@ lines = f.readlines() for line in lines: - # Ignore errors from CPU instruction set testing - if 'src.c' not in line: + # Ignore errors from CPU instruction set or symbol existing testing + keywords = ['src.c', 'CheckSymbolExists.c'] + if all([keyword not in line for keyword in keywords]): print(line) From 9db38712889f049ed97d252db504dc598f696522 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 23 Sep 2020 10:48:42 -0700 Subject: [PATCH 053/449] Update true_divide_out to use at::. (#45079) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45079 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D23821701 Pulled By: ailzhang fbshipit-source-id: 562eac10faba7a503eda0029a0b026c1fb85fe1e --- aten/src/ATen/native/BinaryOps.cpp | 2 +- test/test_torch.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index fc55379578ff..cab77c25b885 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -175,7 +175,7 @@ Tensor& divide_(Tensor& self, Scalar other) { // true_divide, an alias for div Tensor& true_divide_out(Tensor& result, const Tensor& self, const Tensor& divisor) { - return native::div_out(result, self, divisor); + return at::div_out(result, self, divisor); } Tensor true_divide(const Tensor& self, const Tensor& divisor) { diff --git a/test/test_torch.py b/test/test_torch.py index 12a72c2f11c8..4b08697a908c 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -12681,7 +12681,7 @@ def test_scatter_reduce_non_unique_index(self, device, dtype): input.scatter_(0, index, src, reduce=operation) self.assertEqual(input, result, msg=f"result: {result} input: {input} method: {str(operation)}") - @skipCUDAIfRocm + @skipCUDAIfRocm @onlyOnCPUAndCUDA @dtypesIfCUDA(*(torch.testing.get_all_complex_dtypes() + torch.testing.get_all_int_dtypes())) @@ -16857,6 +16857,15 @@ def test_div(self, device, dtype): atol=0.01, rtol=0) self.assertEqual(method(a1, a2), op(a1, a2)) + @dtypes(torch.bfloat16, torch.float) + def test_true_divide_out(self, device, dtype): + a1 = torch.tensor([4.2, 6.2], dtype=dtype, device=device) + a2 = torch.tensor([2., 2.], dtype=dtype, device=device) + res = torch.empty_like(a1) + self.assertEqual(torch.true_divide(a1, a2, out=res), + torch.tensor([2.1, 3.1], dtype=dtype, device=device), + atol=0.01, rtol=0) + @onlyCUDA @dtypes(torch.half) def test_divmul_scalar(self, device, dtype): From a5a4924c2793b64bf68491c136d83f319c133a2d Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Wed, 23 Sep 2020 10:52:54 -0700 Subject: [PATCH 054/449] Warn if `import torch` is called from the source root. (#39995) Summary: This is a small developer quality of life improvement. I commonly try to run some snippet of python as I'm working on a PR and forget that I've cd-d into the local clone to run some git commands, resulting in annoying failures like: `ImportError: cannot import name 'default_generator' from 'torch._C' (unknown location)` This actually took a non-trivial amount of time to figure out the first time I hit it, and even now it's annoying because it happens just infrequently enough to not sit high in the mental cache. 
This PR adds a check to `torch/__init__.py` and warns if `import torch` is likely resolving to the wrong thing: ``` WARNING:root:You appear to be importing PyTorch from a clone of the git repo: /data/users/taylorrobie/repos/pytorch This will prevent `import torch` from resolving to the PyTorch install (instead it will try to load /data/users/taylorrobie/repos/pytorch/torch/__init__.py) and will generally lead to other failures such as a failure to load C extensions. ``` so that the soon to follow internal import failure makes some sense. I elected to make this a warning rather than an exception because I'm not 100% sure that it's **always** wrong. (e.g. weird `PYTHONPATH` or `importlib` corner cases.) EDIT: There are now separate cases for `cwd` vs. `PYTHONPATH`, and failure is an `ImportError`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/39995 Reviewed By: malfet Differential Revision: D23817209 Pulled By: robieta fbshipit-source-id: d9ac567acb22d9c8c567a8565a7af65ac624dbf7 --- torch/__init__.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/torch/__init__.py b/torch/__init__.py index 6523ab126c0d..da9eecad7df5 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -12,6 +12,7 @@ import os import sys import platform +import textwrap import ctypes if sys.version_info < (3,): @@ -193,6 +194,31 @@ def _load_global_deps(): if TYPE_CHECKING: import torch._C as _C +# Check to see if we can load C extensions, and if not provide some guidance +# on what the problem might be. +try: + # _initExtension is chosen (arbitrarily) as a sentinel. + from torch._C import _initExtension +except ImportError: + import torch._C as _C_for_compiled_check + + # The __file__ check only works for Python 3.7 and above. + if sys.version_info >= (3, 7) and _C_for_compiled_check.__file__ is None: + raise ImportError(textwrap.dedent(''' + Failed to load PyTorch C extensions: + It appears that PyTorch has loaded the `torch/_C` folder + of the PyTorch repository rather than the C extensions which + are expected in the `torch._C` namespace. This can occur when + using the `install` workflow. e.g. + $ python setup.py install && python -c "import torch" + + This error can generally be solved using the `develop` workflow + $ python setup.py develop && python -c "import torch" # This should succeed + or by running Python from a different directory. + ''').strip()) from None + raise # If __file__ is not None the cause is unknown, so just re-raise. + + __all__ += [name for name in dir(_C) if name[0] != '_' and not name.endswith('Base')] @@ -477,9 +503,9 @@ def manager_path(): # is not a good way to fix this problem. Perhaps, try to redesign VariableFunctions # so that this import is good enough if TYPE_CHECKING: - # Some type signatures pulled in from _VariableFunctions here clash with + # Some type signatures pulled in from _VariableFunctions here clash with # signatures already imported. For now these clashes are ignored; see - # PR #43339 for details. + # PR #43339 for details. from torch._C._VariableFunctions import * # type: ignore for name in dir(_C._VariableFunctions): From da4033d32adafd6fe0fa3a3727fcb2b55c19e2e9 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 23 Sep 2020 11:03:53 -0700 Subject: [PATCH 055/449] Make cudaHostRegister actually useful on cudart. (#45159) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45159 By default, pybind11 binds void* to be capsules. 
After a lot of Googling, I have concluded that this is not actually useful: you can't actually create a capsule from Python land, and our data_ptr() function returns an int, which means that the function is effectively unusable. It didn't help that we had no tests exercising it. I've replaced the void* with uintptr_t, so that we now accept int (and you can pass data_ptr() in directly). I'm not sure if we should make these functions accept ctypes types; unfortunately, pybind11 doesn't seem to have any easy way to do this. Fixes #43006 Also added cudaHostUnregister which was requested. Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: lw Differential Revision: D23849731 Pulled By: ezyang fbshipit-source-id: 8a79986f3aa9546abbd2a6a5828329ae90fd298f --- test/test_cuda.py | 12 ++++++++++++ torch/csrc/cuda/shared/cudart.cpp | 7 ++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/test/test_cuda.py b/test/test_cuda.py index 011e8c374645..2d23954cfcf8 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -279,6 +279,18 @@ def assert_change(comp=1, empty_cache=False, reset_peak=False): assert_change(0, empty_cache=True) assert_change(0, reset_peak=True) + @skipIfRocm + def test_cudart_register(self): + t = torch.ones(20) + self.assertFalse(t.is_pinned()) + cudart = torch.cuda.cudart() + r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0) + self.assertEquals(r, 0) + self.assertTrue(t.is_pinned()) + r = cudart.cudaHostUnregister(t.data_ptr()) + self.assertEquals(r, 0) + self.assertFalse(t.is_pinned()) + def test_memory_stats(self): gc.collect() torch.cuda.empty_cache() diff --git a/torch/csrc/cuda/shared/cudart.cpp b/torch/csrc/cuda/shared/cudart.cpp index efada16a49c8..a8f80a35855d 100644 --- a/torch/csrc/cuda/shared/cudart.cpp +++ b/torch/csrc/cuda/shared/cudart.cpp @@ -29,7 +29,12 @@ void initCudartBindings(PyObject* module) { cudart.def("cuda" "GetErrorString", cudaGetErrorString); cudart.def("cuda" "ProfilerStart", cudaProfilerStart); cudart.def("cuda" "ProfilerStop", cudaProfilerStop); - cudart.def("cuda" "HostRegister", cudaHostRegister); + cudart.def("cuda" "HostRegister", [](uintptr_t ptr, size_t size, unsigned int flags) -> cudaError_t { + return cudaHostRegister((void*)ptr, size, flags); + }); + cudart.def("cuda" "HostUnregister", [](uintptr_t ptr) -> cudaError_t { + return cudaHostUnregister((void*)ptr); + }); #ifndef __HIP_PLATFORM_HCC__ cudart.def("cuda" "ProfilerInitialize", cudaProfilerInitialize); #endif From 4d80c8c64885eb383d3241bd3ff3d272e5be4cd1 Mon Sep 17 00:00:00 2001 From: Zino Benaissa Date: Wed, 23 Sep 2020 11:12:56 -0700 Subject: [PATCH 056/449] Fix inlining interface call in fork subgraph (#43790) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43790 Interface calls were not handled properly when they are used in fork subgraph. This PR fixes this issue. 
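To make the scenario concrete, here is a condensed sketch of one of the patterns the new tests below exercise (a module method called through `fork`); names are illustrative and freezing goes through the internal `torch._C._freeze_module` entry point, as in the tests:

```python
import torch
import torch.nn as nn

class ForkingModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.w = torch.ones(2, 2)

    @torch.jit.export
    def scale(self, x):
        return x * self.w

    def forward(self, x):
        # The forked call lands in a separate subgraph; freezing must now
        # resolve and inline the module/interface call inside it as well.
        fut = torch.jit._fork(self.scale, x)
        y = self.scale(x)
        return y + torch.jit._wait(fut)

m = torch.jit.script(ForkingModule())
m.eval()
frozen = torch._C._freeze_module(m._c)
out = frozen.forward(torch.randn(2, 2))
```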
Test Plan: Imported from OSS Reviewed By: eellison Differential Revision: D23402039 Pulled By: bzinodev fbshipit-source-id: 41adc5ee7d942250e732e243ab30e356d78d9bf7 --- test/jit/test_freezing.py | 52 +++++++++++++++++++++++-- test/jit/test_module_interface.py | 52 +++++++++++++++++++++++++ torch/csrc/jit/passes/freeze_module.cpp | 37 ++++++++++-------- 3 files changed, 121 insertions(+), 20 deletions(-) diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index 2d2c404051f6..4ec8f7e46d1b 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -237,8 +237,8 @@ def forward(self, x): def test_freeze_module_with_fork2(self): @torch.jit.script - def foo(x, y): - return x * y + def foo(x): + return x * 2 class TestModule(nn.Module): def __init__(self): @@ -247,8 +247,8 @@ def __init__(self): self.b = torch.ones(20, 20) def forward(self, x): - fut = torch.jit._fork(foo, self.a, self.b) - y_hat = foo(self.a, self.b) + fut = torch.jit._fork(foo, self.a) + y_hat = foo(self.b) y = torch.jit._wait(fut) return y_hat + y @@ -272,6 +272,50 @@ def forward(self, x): # conservatively assumes there is a mutation because attributes are # passed to fork subgraph. both 'a' and 'b' are preserved. self.assertTrue(mf.hasattr('a')) + self.assertFalse(mf.hasattr('b')) + output_f = mf.forward(input) + self.assertEqual(output_s, output_f) + + def test_freeze_module_with_fork_calling_module_method(self): + @torch.jit.script + def foo(x, y): + return x * y + + class TestModule(nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.a = torch.ones(20, 20) + self.b = torch.ones(20, 20) + + @torch.jit.export + def foo(self, x): + return x * self.a + + @torch.jit.export + def bar(self, x): + return x * self.b + + def forward(self, x): + fut = torch.jit._fork(self.foo, self.b) + y_hat = self.bar(self.a) + y = torch.jit._wait(fut) + return y_hat + y + + m = torch.jit.script(TestModule()) + m.eval() + input = torch.randn(2, 2) + output_s = m.forward(input) + mf = torch._C._freeze_module(m._c) + # Check if frozen module looks as below: + # module m { + # attributes { + # self.b = .. + # } + # ... + # TODO: Although there are no mutation, the alias analysis + # conservatively assumes there is a mutation because attributes are + # passed to fork subgraph. 'b' is preserved. 
+ self.assertFalse(mf.hasattr('a')) self.assertTrue(mf.hasattr('b')) output_f = mf.forward(input) self.assertEqual(output_s, output_f) diff --git a/test/jit/test_module_interface.py b/test/jit/test_module_interface.py index 963c1ede8323..f06dafbc1ba2 100644 --- a/test/jit/test_module_interface.py +++ b/test/jit/test_module_interface.py @@ -595,6 +595,58 @@ def forward(self, x): with self.assertRaisesRegex(RuntimeError, "failed to freeze interface attribute 'proxy_mod'"): mf = torch._C._freeze_module(m._c, freezeInterfaces = True) + def test_freeze_module_with_interface_and_fork(self): + class SubModule(torch.nn.Module): + def __init__(self): + super(SubModule, self).__init__() + self.b = torch.tensor([1.5]) + + def forward(self, x): + self.b[0] += 3.2 + return self.b + + class OrigMod(torch.nn.Module): + def __init__(self): + super(OrigMod, self).__init__() + self.a = torch.tensor([0.5]) + + def forward(self, x): + return self.a + + @torch.jit.interface + class ModInterface(torch.nn.Module): + def forward(self, x): + # type: (Tensor) -> Tensor + pass + + class TestModule(torch.nn.Module): + proxy_mod : ModInterface + + def __init__(self): + super(TestModule, self).__init__() + self.proxy_mod = OrigMod() + self.sub = SubModule() + + def forward(self, x): + y = self.proxy_mod(x); + z= self.sub(x) + return y + z + + class MainModule(torch.nn.Module): + def __init__(self): + super(MainModule, self).__init__() + self.test= TestModule(); + + def forward(self, x): + fut = torch.jit._fork(self.test.forward, x) + y = self.test(x) + z = torch.jit._wait(fut) + return y + z + + m = torch.jit.script(MainModule()) + m.eval() + mf = torch._C._freeze_module(m._c, freezeInterfaces = True) + def test_module_apis_interface(self): @torch.jit.interface class ModuleInterface(nn.Module): diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 4e95c9af40e3..bec7bf144201 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -97,12 +97,7 @@ class AttributePropagator { auto graph = function->graph(); optimizeSubGraphs(graph, applyInline); if (freezeInterfaces_) { - optimizeSubGraphs( - graph, - std::bind( - &AttributePropagator::inlineInterfaceCalls, - *this, - std::placeholders::_1)); + inlineInterfaceCalls(graph); } // Record Attributes that are explicitly set in the module. // They cannot be folded. @@ -379,6 +374,14 @@ class AttributePropagator { inlineInterfaceCall(n, attr); // Reset the GetAttr to concrete module type. n->output()->setType(attr.type()); + } else if (n->kind() == prim::fork) { + applyToForkSubgraph( + n, + graph, + std::bind( + &AttributePropagator::inlineInterfaceCalls, + *this, + std::placeholders::_1)); } } } @@ -476,18 +479,20 @@ class AttributePropagator { auto node = n->inputs()[0]->node(); // Check if first parameter of fork is a module. This module is used // as the base module (similar to 'self' in forward) to resolve GetAttrs. - if (node->kind() != prim::GetAttr) { - return; - } - auto name = node->s(attr::name); - auto input = node->inputs()[0]; - if (!findConstantAttr(input, name, attrModule, graph)) { - // Module needs to be preserved. - return; + // Otherwise freezing is applied using module_ + if (node->kind() == prim::GetAttr && + node->output()->type()->cast()) { + auto name = node->s(attr::name); + auto input = node->inputs()[0]; + if (!findConstantAttr(input, name, attrModule, graph)) { + // Module needs to be preserved. 
+        return;
+      }
+      attrModule = attrModule.attr(name).toModule();
+      std::swap(module_, attrModule);
     }
-    attrModule = attrModule.attr(name).toModule();
+
     auto subgraph = n->g(attr::Subgraph);
-    std::swap(module_, attrModule);
     func(subgraph);
     module_ = attrModule;
   }

From 99242eca1db7ac50ae809f3dd57e3d5ae2b88284 Mon Sep 17 00:00:00 2001
From: Tim Nieradzik
Date: Wed, 23 Sep 2020 11:36:33 -0700
Subject: [PATCH 057/449] Dockerfile: Support CUDA 11 (#45071)

Summary:
Although PyTorch already supports CUDA 11, the Dockerfile still relies on CUDA 10. This pull request upgrades all the necessary versions such that recent NVIDIA GPUs like A100 can be used.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/45071

Reviewed By: ezyang

Differential Revision: D23873224

Pulled By: seemethere

fbshipit-source-id: 822c25f183dcc3b4c5b780c00cd37744d34c6e00
---
 Dockerfile      | 4 ++--
 docker.Makefile | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index d5619e1a8011..5bae3ec14ea6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -44,13 +44,13 @@ WORKDIR /opt/pytorch
 COPY --from=conda /opt/conda /opt/conda
 COPY --from=submodule-update /opt/pytorch /opt/pytorch
 RUN --mount=type=cache,target=/opt/ccache \
-    TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
+    TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
     python setup.py install

 FROM conda as conda-installs
 ARG INSTALL_CHANNEL=pytorch-nightly
-RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y pytorch torchvision cudatoolkit=10.1 && \
+RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y pytorch torchvision cudatoolkit=11.0.221 && \
     /opt/conda/bin/conda clean -ya

 FROM ${BASE_IMAGE} as official
diff --git a/docker.Makefile b/docker.Makefile
index ba53b94d7898..18acced1de8d 100644
--- a/docker.Makefile
+++ b/docker.Makefile
@@ -9,7 +9,7 @@ DOCKER_ORG = $(shell whoami)
 endif

 BASE_RUNTIME = ubuntu:18.04
-BASE_DEVEL = nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
+BASE_DEVEL = nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04

 # The conda channel to use to install pytorch / torchvision
 INSTALL_CHANNEL = pytorch

From 21fabae47a44a95b7840266a31c89dab1731ef6c Mon Sep 17 00:00:00 2001
From: Bradley Davis
Date: Wed, 23 Sep 2020 13:50:26 -0700
Subject: [PATCH 058/449] Remove expensive call to PyObject_GetAttrString in PyTorch_LookupSpecial (#44684)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44684

The ad-hoc quantization benchmarking script in D23689062 recently highlighted that quantized ops were surprisingly slow after the introduction of support for custom ops in torch.fx in D23203204 (https://github.com/pytorch/pytorch/commit/f15e27265ff76f49844b0ccc6ca387cb564824bf).

Using strobelight, it's immediately clear that up to 66% of samples were seen in `c10::get_backtrace`, which descends from `torch::is_tensor_and_apppend_overloaded -> torch::check_has_torch_function -> torch::PyTorch_LookupSpecial -> PyObject_HasAttrString -> PyObject_GetAttrString`.
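
To make the problem concrete, here is a minimal standalone sketch of the two lookup patterns, using only the plain CPython C API. The `lookup_special_double` / `lookup_special_single` names are invented for illustration; this is not the code touched by the patch, which simply drops the `PyObject_HasAttrString` pre-check and relies on torch's `PyObject_FastGetAttrString` helper alone:

```
#include <Python.h>

// Redundant pattern: PyObject_HasAttrString() already performs a full
// attribute lookup (and swallows the AttributeError) just to produce a bool,
// and then the attribute is looked up a second time.
static PyObject* lookup_special_double(PyObject* obj, const char* name) {
  PyObject* tp = reinterpret_cast<PyObject*>(Py_TYPE(obj));
  if (PyObject_HasAttrString(tp, name) == 0) {
    return nullptr;  // not found
  }
  return PyObject_GetAttrString(tp, name);  // second lookup, new reference
}

// Single-lookup pattern: look the attribute up once and treat "missing" as a
// normal result instead of letting the error propagate.
static PyObject* lookup_special_single(PyObject* obj, const char* name) {
  PyObject* tp = reinterpret_cast<PyObject*>(Py_TYPE(obj));
  PyObject* attr = PyObject_GetAttrString(tp, name);  // new reference or NULL
  if (attr == nullptr) {
    PyErr_Clear();  // a missing attribute is expected here, not an error
  }
  return attr;
}
```

Either way the caller only checks the result for NULL; the difference is that the second version goes through the attribute machinery once per call instead of twice.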
I'm no expert by any means so please correct any/all misinterpretation, but it appears that: - `check_has_torch_function` only needs to return a bool - `PyTorch_LookupSpecial` should return `NULL` if a matching method is not found on the object - in the impl of `PyTorch_LookupSpecial` the return value from `PyObject_HasAttrString` only serves as a bool to return early, but ultimately ends up invoking `PyObject_GetAttrString`, which raises, spawning the generation of a backtrace - `PyObject_FastGetAttrString` returns `NULL` (stolen ref to an empty py::object if the if/else if isn't hit) if the method is not found, anyway, so it could be used singularly instead of invoking both `GetAttrString` and `FastGetAttrString` - D23203204 (https://github.com/pytorch/pytorch/commit/f15e27265ff76f49844b0ccc6ca387cb564824bf) compounded (but maybe not directly caused) the problem by increasing the number of invocations so, removing it in this diff and seeing how many things break :) before: strobelight: see internal section output from D23689062 script: ``` $ ./buck-out/gen/scripts/v/test_pt_quant_perf.par Sequential( (0): Quantize(scale=tensor([0.0241]), zero_point=tensor([60]), dtype=torch.quint8) (1): QuantizedLinear(in_features=4, out_features=4, scale=0.017489388585090637, zero_point=68, qscheme=torch.per_tensor_affine) (2): DeQuantize() ) fp 0.010896682739257812 q 0.11908197402954102 ``` after: strobelight: see internal section output from D23689062 script: ``` $ ./buck-out/gen/scripts/v/test_pt_quant_perf.par Sequential( (0): Quantize(scale=tensor([0.0247]), zero_point=tensor([46]), dtype=torch.quint8) (1): QuantizedLinear(in_features=4, out_features=4, scale=0.012683945707976818, zero_point=41, qscheme=torch.per_tensor_affine) (2): DeQuantize() ) fp 0.011141300201416016 q 0.022639036178588867 ``` which roughly restores original performance seen in P142370729 UPDATE: 9/22 mode/opt benchmarks ``` buck run //scripts/x:test_pt_quant_perf mode/opt Sequential( (0): Quantize(scale=tensor([0.0263]), zero_point=tensor([82]), dtype=torch.quint8) (1): QuantizedLinear(in_features=4, out_features=4, scale=0.021224206313490868, zero_point=50, qscheme=torch.per_tensor_affine) (2): DeQuantize() ) fp 0.002968311309814453 q 0.5138928890228271 ``` with patch: ``` buck run //scripts/x:test_pt_quant_perf mode/opt Sequential( (0): Quantize(scale=tensor([0.0323]), zero_point=tensor([70]), dtype=torch.quint8) (1): QuantizedLinear(in_features=4, out_features=4, scale=0.017184294760227203, zero_point=61, qscheme=torch.per_tensor_affine) (2): DeQuantize() ) fp 0.0026655197143554688 q 0.0064449310302734375 ``` Reviewed By: ezyang Differential Revision: D23697334 fbshipit-source-id: f756d744688615e01c94bf5c48c425747458fb33 --- torch/csrc/utils/python_arg_parser.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index a641fbda2013..78efb6cf2db3 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -712,9 +712,6 @@ static py::object PyTorch_LookupSpecial(PyObject *obj, char* name) if (_is_basic_python_type(tp)) { return py::object(); } - if(PyObject_HasAttrString(obj, name) == 0){ - return py::object(); - } return PyObject_FastGetAttrString((PyObject *)tp, name); } From adb2b380baf1d78a5e4a48d8a6999b94aaeff403 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Sep 2020 13:53:52 -0700 Subject: [PATCH 059/449] [quant][graphmode][fx] qconfig_dict support more types of configurations (#44856) Summary: Pull 
Request resolved: https://github.com/pytorch/pytorch/pull/44856 Support following format of qconfig_dict ```python qconfig_dict = { # optional, global config "": qconfig?, # optional, used for module and function types # could also be split into module_types and function_types if we prefer "object_type": [ (nn.Conv2d, qconfig?), (F.add, qconfig?), ..., ], # optional, used for module names "module_name": [ ("foo.bar", qconfig?) ..., ], # optional, matched in order, first match takes precedence "module_name_regex": [ ("foo.*bar.*conv[0-9]+", qconfig?) ..., ] # priority (in increasing order): global, object_type, module_name_regex, module_name # qconfig == None means fusion and quantization should be skipped for anything # matching the rule } ``` Test Plan: Imported from OSS Reviewed By: vkuzo Differential Revision: D23751304 fbshipit-source-id: 5b98f4f823502b12ae2150c93019c7b229c49c50 --- test/quantization/test_quantize_fx.py | 126 +++++++++++++++++++++++++- torch/quantization/fx/quantize.py | 120 ++++++++++++++++++++++-- torch/quantization/quantize_fx.py | 38 ++++++-- 3 files changed, 263 insertions(+), 21 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index fc4a735854ef..c1641ae3e194 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -61,6 +61,7 @@ import operator import unittest +@skipIfNoFBGEMM class TestQuantizeFx(QuantizationTestCase): def _get_conv_linear_test_cases(self): ''' Returns a list of test cases, with format: @@ -334,7 +335,8 @@ def forward(self, x): m = M().eval() m = symbolic_trace(m) - qconfig_dict = {'': default_qconfig, 'conv2': None} + qconfig_dict = {"": default_qconfig, + "module_name": [("conv2", None)]} m = prepare_static_fx(m, qconfig_dict) data = torch.randn(1, 1, 1, 1) m(data) @@ -344,11 +346,131 @@ def forward(self, x): node_list = [ ns.call_function(torch.quantize_per_tensor), ns.call_module(nnq.Conv2d), - ns.call_method('dequantize'), + ns.call_method("dequantize"), ns.call_module(nn.Conv2d), ] self.checkGraphModuleNodes(m, expected_node_list=node_list) + def test_qconfig_module_type(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(1, 1, 1) + self.conv2 = nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + m = M().eval() + m = symbolic_trace(m) + qconfig_dict = {"object_type": [(torch.nn.Conv2d, default_qconfig)]} + m = prepare_static_fx(m, qconfig_dict) + data = torch.randn(1, 1, 1, 1) + m(data) + m = convert_static_fx(m) + m(data) + # first conv is quantized, second conv is not quantized + node_list = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_module(nnq.Conv2d), + ns.call_module(nnq.Conv2d), + ns.call_method("dequantize"), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + def test_qconfig_function(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + + def forward(self, x, y): + return x + y + + m = M().eval() + m = symbolic_trace(m) + qconfig_dict = {"object_type": [(operator.add, default_qconfig)]} + m = prepare_static_fx(m, qconfig_dict) + data = torch.randn(1, 1, 1, 1) + m(data, data) + m = convert_static_fx(m) + m(data, data) + # first conv is quantized, second conv is not quantized + node_list = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_function(torch.ops.quantized.add), + ns.call_method("dequantize"), + ] + self.checkGraphModuleNodes(m, 
expected_node_list=node_list) + + def test_qconfig_module_name_regex(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(1, 1, 1) + self.conv2 = nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + m = M().eval() + m = symbolic_trace(m) + qconfig_dict = {"module_name_regex": [("conv*", default_qconfig)]} + m = prepare_static_fx(m, qconfig_dict) + data = torch.randn(1, 1, 1, 1) + m(data) + m = convert_static_fx(m) + m(data) + # first conv is quantized, second conv is not quantized + node_list = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_module(nnq.Conv2d), + ns.call_module(nnq.Conv2d), + ns.call_method("dequantize"), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + def test_qconfig_precedence(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.linear = nn.Linear(1, 1) + self.conv = nn.Conv2d(1, 1, 1) + self.module_conv1 = nn.Conv2d(1, 1, 1) + self.module_conv2 = nn.Conv2d(1, 1, 1) + + def forward(self, x): + # global + x = self.linear(x) + # global + object_type --> object_type + x = self.conv(x) + # global + object_type + module_name_regex --> module_name_regex + x = self.module_conv1(x) + # global + object_type + module_name_regex + module_name --> module_name + x = self.module_conv2(x) + return x + + m = M().eval() + m = symbolic_trace(m) + global_qconfig = default_qconfig + object_type_qconfig = default_dynamic_qconfig + module_name_regex_qconfig = float16_dynamic_qconfig + module_name_qconfig = default_qat_qconfig + qconfig_dict = { + "": global_qconfig, + "object_type": [(nn.Conv2d, object_type_qconfig)], + "module_name_regex": [("module_conv*", module_name_regex_qconfig)], + "module_name": [("module_conv2", module_name_qconfig)]} + m = prepare_static_fx(m, qconfig_dict) + self.assertEqual(m.linear.qconfig, global_qconfig) + self.assertEqual(m.conv.qconfig, object_type_qconfig) + self.assertEqual(m.module_conv1.qconfig, module_name_regex_qconfig) + self.assertEqual(m.module_conv2.qconfig, module_name_qconfig) + + def test_remove_qconfig(self): class M(torch.nn.Module): def __init__(self): diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 6254120999f0..67e538b40433 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -40,7 +40,9 @@ quantize_node, ) +from collections import OrderedDict import copy +import re # ------------------------ # Helper Functions @@ -136,6 +138,54 @@ def is_submodule_of_fake_quant(name, module, named_modules): parent_name, _ = _parent_name(name) return is_activation_post_process(named_modules[parent_name]) +def get_flattened_qconfig_dict(qconfig_dict): + """ flatten the global, object_type and module_name qconfig + to the same qconfig_dict so that it can be used by + propagate_qconfig_ function. + "module_name_regex" is ignored for now since it's not supported + in propagate_qconfig_, but it can be fixed later. 
+ + For example: + Input: { + "": qconfig, + "object_type": [ + (torch.add, qconfig) + ], + "module_name": [ + ("conv", qconfig) + ] + } + + Output: { + "": qconfig, + torch.add: qconfig, + "conv": qconfig + } + """ + flattened = dict() + if '' in qconfig_dict: + flattened[''] = qconfig_dict[''] + + def flatten_key(key): + if key in qconfig_dict: + for obj, qconfig in qconfig_dict[key]: + flattened[obj] = qconfig + + flatten_key('object_type') + flatten_key('module_name') + return flattened + +def convert_dict_to_ordered_dict(qconfig_dict): + """ Convert dict in qconfig_dict to ordered dict + """ + # convert a qconfig list for a type to OrderedDict + def _convert_to_ordered_dict(key, qconfig_dict): + qconfig_dict[key] = OrderedDict(qconfig_dict.get(key, [])) + + _convert_to_ordered_dict('object_type', qconfig_dict) + _convert_to_ordered_dict('module_name_regex', qconfig_dict) + _convert_to_ordered_dict('module_name', qconfig_dict) + # A dictionary for querying the weight index for a given op WEIGHT_INDEX_DICT = { torch.nn.functional.conv2d : [1], @@ -181,23 +231,72 @@ def __init__(self): def _qat_swap_modules(self, root): convert(root, mapping=get_qat_module_mappings(), inplace=True, remove_qconfig=False) - def _generate_qconfig_map(self, root, input_graph): - def get_qconfig(module): - return module.qconfig if hasattr(module, 'qconfig') else None + def _generate_qconfig_map(self, + root, + input_graph, + qconfig_dict): + global_qconfig = qconfig_dict.get('', None) + + def get_module_type_qconfig( + module_type, fallback_qconfig=global_qconfig): + return qconfig_dict['object_type'].get(module_type, fallback_qconfig) + + def get_function_qconfig( + function, fallback_qconfig=global_qconfig): + return qconfig_dict['object_type'].get(function, fallback_qconfig) + + def get_module_name_regex_qconfig( + module_name, fallback_qconfig=global_qconfig): + for regex_pattern, qconfig in qconfig_dict['module_name_regex'].items(): + if re.match(regex_pattern, module_name): + # first match wins + return qconfig + return fallback_qconfig + + def get_module_name_qconfig( + module_name, fallback_qconfig=global_qconfig): + if module_name == '': + # module name qconfig not found + return fallback_qconfig + if module_name in qconfig_dict['module_name']: + return qconfig_dict['module_name'][module_name] + else: + parent, _ = _parent_name(module_name) + return get_module_name_qconfig(parent, fallback_qconfig) + + # get qconfig for module_name, + # fallback to module_name_regex_qconfig, module_type_qconfig, global_qconfig + # if necessary + def get_qconfig(module_name): + module_type_qconfig = \ + get_module_type_qconfig(type(self.modules[module_name])) + module_name_regex_qconfig = \ + get_module_name_regex_qconfig(module_name, module_type_qconfig) + module_name_qconfig = \ + get_module_name_qconfig(module_name, module_name_regex_qconfig) + return module_name_qconfig self.qconfig_map = dict() for node in input_graph.nodes: if node.op == 'get_attr': - parent, _ = _parent_name(node.target) - self.qconfig_map[node.name] = get_qconfig(self.modules[parent]) + module_name, _ = _parent_name(node.target) + self.qconfig_map[node.name] = get_qconfig(module_name) elif node.op == 'call_function': - self.qconfig_map[node.name] = get_qconfig(root) + # precedence: [TODO] module_name_qconfig (need scope support from fx) + # > function_qconfig > global_qconfig + function_qconfig = get_function_qconfig(node.target) + self.qconfig_map[node.name] = function_qconfig elif node.op == 'call_method': self_obj = node.args[0] # qconfig 
for call_method should be the same as the `self` object for the call self.qconfig_map[node.name] = self.qconfig_map[self_obj.name] elif node.op == 'call_module': - self.qconfig_map[node.name] = get_qconfig(self.modules[node.target]) + module_qconfig = get_qconfig(node.target) + # regex is not supported eager mode propagate_qconfig_, we'll need to + # set the qconfig explicitly here in case regex + # is used + self.modules[node.target].qconfig = module_qconfig + self.qconfig_map[node.name] = module_qconfig def _prepare(self, model, qconfig_dict, inplace, is_dynamic_quant): if not inplace: @@ -208,14 +307,17 @@ def _prepare(self, model, qconfig_dict, inplace, is_dynamic_quant): else: self.patterns = get_quant_patterns() - propagate_qconfig_(model, qconfig_dict) + flattened_qconfig_dict = get_flattened_qconfig_dict(qconfig_dict) + # TODO: support regex as well + propagate_qconfig_(model, flattened_qconfig_dict) if model.training: self._qat_swap_modules(model) self.modules = dict(model.named_modules()) + convert_dict_to_ordered_dict(qconfig_dict) # map from node name to qconfig, used in _find_matches - self._generate_qconfig_map(model, model.graph) + self._generate_qconfig_map(model, model.graph, qconfig_dict) # match the patterns that will get quantized matches = self._find_matches(model.graph, self.modules, self.patterns) diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index 77178552ee71..0f68f2e0e9e9 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -115,16 +115,34 @@ def quantize_static_fx(model, qconfig_dict, run_fn, run_args, inplace=False, deb Args: `model`: input float TorchScript model - `qconfig_dict`: qconfig_dict is a dictionary with names of sub modules as key and - qconfig for that module as value, empty key means the qconfig will be applied - to whole model unless it’s overwritten by more specific configurations, the - qconfig for each module is either found in the dictionary or fallback to - the qconfig of parent module. - - Right now qconfig_dict is the only way to configure how the model is quantized, - and it is done in the granularity of module, that is, we only support one type - of qconfig for each torch.nn.Module, and the qconfig for sub module will - override the qconfig for parent module, empty string means global configuration. + `qconfig_dict`: qconfig_dict is a dictionary with the following configurations: + qconfig_dict = { + # optional, global config + "": qconfig?, + + # optional, used for module and function types + # could also be split into module_types and function_types if we prefer + "object_type": [ + (torch.nn.Conv2d, qconfig?), + (torch.nn.functional.add, qconfig?), + ..., + ], + + # optional, used for module names + "module_name": [ + ("foo.bar", qconfig?) + ..., + ], + + # optional, matched in order, first match takes precedence + "module_name_regex": [ + ("foo.*bar.*conv[0-9]+", qconfig?) 
+            ...,
+          ]
+          # priority (in increasing order): global, object_type, module_name_regex, module_name
+          # qconfig == None means fusion and quantization should be skipped for anything
+          # matching the rule
+        }
     `run_fn`: a calibration function for calibrating the prepared model
     `run_args`: positional arguments for `run_fn`
     `inplace`: carry out model transformations in-place, the original module is

From 9e206ee9f1b287d95970abc4a1bcd1756527d012 Mon Sep 17 00:00:00 2001
From: Nick Gibson
Date: Wed, 23 Sep 2020 13:55:01 -0700
Subject: [PATCH 060/449] [NNC] Fix a bug in SplitWithMask when splitting multiple times (#45141)

Summary:
When doing a splitWithMask we only mask if the loop extent is not cleanly divided by the split factor. However, the logic does not simplify the extents, so any nontrivial loop extent will always cause a mask to be added, e.g. if the loop had been previously split. Unlike splitWithTail, the masks added by splitWithMask are always overhead and we don't have the analysis to optimize them out if they are unnecessary, so it's good to avoid inserting them if we can. The fix is just to simplify the loop extents before doing the extent calculation.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/45141

Reviewed By: ezyang

Differential Revision: D23869170

Pulled By: nickgg

fbshipit-source-id: 44686fd7b802965ca4f5097b0172a41cf837a1f5
---
 test/cpp/tensorexpr/test_loopnest.cpp  | 34 ++++++++++++++++++++++++++
 test/cpp/tensorexpr/tests.h            |  1 +
 torch/csrc/jit/tensorexpr/loopnest.cpp |  9 ++++---
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp
index 602eb116e7b9..201a7e57820b 100644
--- a/test/cpp/tensorexpr/test_loopnest.cpp
+++ b/test/cpp/tensorexpr/test_loopnest.cpp
@@ -609,6 +609,40 @@ void testExprSplitWithMask01() {
   ExpectAllNear(c_v, c_ref, 1e-5);
 }

+// Tests the case where we split a loop cleanly multiple times, we should not
+// insert any masks.
+void testExprSplitWithMaskRepeatedNoMask() {
+  KernelScope kernel_scope;
+  const int M = 64;
+  Buffer a_buf("a", kFloat, {M});
+  Buffer b_buf("b", kFloat, {M});
+  Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
+    return a_buf(m) + b_buf(m) + 1.0f;
+  });
+
+  LoopNest l({tensor});
+  std::vector<For*> loops = l.getLoopStmtsFor(tensor);
+  For *outer, *mid, *inner;
+  l.splitWithMask(loops[0], 4, &outer, &inner);
+  l.splitWithMask(outer, 4, &outer, &mid);
+
+  Stmt* stmt1 = IRSimplifier::simplify(l.root_stmt());
+  std::ostringstream oss;
+  oss << *stmt1;
+
+  // Two splits mean 3 loops, but should need no masks in this case.
+ const std::string& verification_pattern = + R"IR( +# CHECK: for ( +# CHECK-NOT: if ( +# CHECK: for ( +# CHECK-NOT: if ( +# CHECK: for ( +# CHECK-NOT: if ( +# CHECK: f[)IR"; + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); +} + void testSplitWithTailWithLoopOptions() { KernelScope kernel_scope; const int M = 21; diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index 56831c8db663..d0a9aa840b91 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -55,6 +55,7 @@ namespace jit { _(ExprSplitWithTail) \ _(ExprSplitWithTailNone) \ _(ExprSplitWithMask01) \ + _(ExprSplitWithMaskRepeatedNoMask) \ _(SplitWithTailWithLoopOptions) \ _(SplitWithMaskWithLoopOptions) \ _(ScheduleBroadcastAddBuffer) \ diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index f80e4585b790..b7862fb953c1 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -1007,10 +1007,11 @@ void LoopNest::splitWithMask(For* f, int factor, For** outer, For** inner) { } bool tail_is_needed = true; - if (dynamic_cast(f->start()) && - dynamic_cast(f->stop())) { - int start_val = dynamic_cast(f->start())->value(); - int stop_val = dynamic_cast(f->stop())->value(); + const Expr* start = IRSimplifier::simplify(f->start()); + const Expr* stop = IRSimplifier::simplify(f->stop()); + if (start->isConstant() && stop->isConstant()) { + int start_val = immediateAs(start); + int stop_val = immediateAs(stop); int size_val = stop_val - start_val; int tail_size = size_val % factor; if (tail_size == 0) { From 3f89b779c4152cec48a9ed2baa704cbc183e8afc Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Wed, 23 Sep 2020 14:01:43 -0700 Subject: [PATCH 061/449] [jit] allow submodule methods inference rule be different (#43872) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43872 This PR allows the recursive scripting to have a separate submodule_stubs_fn to create its submodule with specific user provided rules. Fixes https://github.com/pytorch/pytorch/issues/43729 Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D23430176 Pulled By: wanchaol fbshipit-source-id: 20530d7891ac3345b36f1ed813dc9c650b28d27a --- test/jit/test_tracer.py | 33 +++++++++++++++++++++++++++++++++ torch/jit/_recursive.py | 17 +++++++++++++++-- torch/jit/_script.py | 7 +++++-- torch/jit/_trace.py | 21 ++++----------------- 4 files changed, 57 insertions(+), 21 deletions(-) diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 22921f7d684a..518af2f95a4c 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -1310,6 +1310,39 @@ def check(mod): imported = self.getExportImportCopy(traced) check(imported.foo) + # Note that Bar's forward can only be traced, but not scripted + class Bar(nn.Module): + def __init__(self): + super().__init__() + + @torch.jit.export + def addTwo(self, x): + return x + 2 + + def forward(self, input): + return (lambda a: a + 1)(input) + + # When tracing Bar as a submodule, we only want to script the + # exported methods, and we want to keep the forwards still + # being traced. 
+ class WrapperExports(torch.nn.Module): + def __init__(self): + super(WrapperExports, self).__init__() + self.bar = Bar() + + @torch.jit.export + def addOne(self, x): + return x + 1 + + def forward(self, x): + return self.bar(x) + + f = WrapperExports() + + traced = torch.jit.trace(f, (torch.rand(3, 4),)) + expected_names = ['addOne'] + check(traced) + def test_trace_autograd_function(self): class TestFunc(torch.autograd.Function): @staticmethod diff --git a/torch/jit/_recursive.py b/torch/jit/_recursive.py index 85853cd1b1ee..0eb423516f6f 100644 --- a/torch/jit/_recursive.py +++ b/torch/jit/_recursive.py @@ -52,6 +52,19 @@ def make_stub_from_method(nn_module, method_name): return make_stub(func, method_name) +def make_stubs_from_exported_methods(mod): + stubs = [] + for name in dir(mod): + item = getattr(mod, name, None) + if ( + _jit_internal.get_torchscript_modifier(item) + is _jit_internal.FunctionModifiers.EXPORT + ): + stubs.append(make_stub_from_method(mod, name)) + + return stubs + + # base types that can be constants # in addition, tuples and lists of these base types are also considered constants # If you edit this list, then you also need to edit the handlers in @@ -371,8 +384,8 @@ def init_fn(script_module): elif isinstance(orig_value, torch.jit.ScriptModule): scripted = orig_value else: - # use the default recursive rule to compile the module - scripted = create_script_module_impl(orig_value, sub_concrete_type, infer_methods_to_compile) + # always reuse the provided stubs_fn to infer the methods to compile + scripted = create_script_module_impl(orig_value, sub_concrete_type, stubs_fn) cpp_module.setattr(name, scripted) script_module._modules[name] = scripted diff --git a/torch/jit/_script.py b/torch/jit/_script.py index fb0465288e3f..4d28a5f2ad13 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -18,7 +18,7 @@ import torch import torch._jit_internal as _jit_internal from torch.utils import set_module -from torch.jit._recursive import ScriptMethodStub, wrap_cpp_module +from torch.jit._recursive import ScriptMethodStub, wrap_cpp_module, infer_methods_to_compile from torch.nn import Module from torch.jit._state import _enabled from torch.jit._builtins import _register_builtin @@ -200,7 +200,10 @@ def init_then_script(self, *args, **kwargs): def make_stubs(module): cls = type(module) - return [v for k, v in sorted(cls._methods.items())] + if hasattr(cls, "_methods"): + return [v for k, v in sorted(cls._methods.items())] + else: + return infer_methods_to_compile(module) self.__dict__[ "_actual_script_module" diff --git a/torch/jit/_trace.py b/torch/jit/_trace.py index 3b312c7e2161..e73785e15aea 100644 --- a/torch/jit/_trace.py +++ b/torch/jit/_trace.py @@ -22,7 +22,6 @@ from torch.jit._script import ScriptModule, _CachedForward, script from torch._jit_internal import _qualified_name from torch.autograd import function -from torch import _jit_internal from torch.nn import Module _flatten = torch._C._jit_flatten @@ -549,23 +548,11 @@ def make_module(mod, _module_class, _compilation_unit): return mod elif torch._jit_internal.module_has_exports(mod): - def make_stubs_from_exported_methods(mod): - exported = [] - for name in dir(mod): - item = getattr(mod, name, None) - if ( - torch._jit_internal.get_torchscript_modifier(item) - is _jit_internal.FunctionModifiers.EXPORT - ): - exported.append(name) - - stubs = [] - for method in exported: - stubs.append(torch.jit._recursive.make_stub_from_method(mod, method)) - return stubs - + infer_methods_stubs_fn = 
torch.jit._recursive.make_stubs_from_exported_methods return torch.jit._recursive.create_script_module( - mod, make_stubs_from_exported_methods, share_types=False + mod, + infer_methods_stubs_fn, + share_types=False ) else: if _module_class is None: From d2b045030eb60283b8aeeb2956c7ebe91628fece Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Wed, 23 Sep 2020 14:26:03 -0700 Subject: [PATCH 062/449] gtest-ify JIT tests, through the letter c (#45020) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45020 See https://github.com/pytorch/pytorch/pull/45018 for context. Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D23802296 Pulled By: suo fbshipit-source-id: 20c9798a414e9ba30869a862012cbdee0613c8b1 --- test/cpp/jit/test_autodiff.cpp | 9 +- test/cpp/jit/test_class_import.cpp | 12 +- test/cpp/jit/test_class_parser.cpp | 4 +- test/cpp/jit/test_cleanup_passes.cpp | 37 +- test/cpp/jit/test_code_template.cpp | 50 ++- test/cpp/jit/test_constant_pooling.cpp | 87 ++--- .../jit/test_create_autodiff_subgraphs.cpp | 5 +- test/cpp/jit/test_custom_class.cpp | 4 +- test/cpp/jit/test_custom_operators.cpp | 342 +++++++++--------- test/cpp/jit/test_misc.cpp | 10 + test/cpp/jit/tests.h | 16 - 11 files changed, 282 insertions(+), 294 deletions(-) diff --git a/test/cpp/jit/test_autodiff.cpp b/test/cpp/jit/test_autodiff.cpp index 7d431776a971..3993c63b1708 100644 --- a/test/cpp/jit/test_autodiff.cpp +++ b/test/cpp/jit/test_autodiff.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/frontend/tracer.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" @@ -83,7 +84,7 @@ variable_list grad( fmap(inputs, get_edge)); } -void testADFormulas() { +TEST(AutodiffTest, ADFormulas) { const auto cast = [](const Variable& v) { return static_cast(v); }; @@ -174,7 +175,7 @@ void testADFormulas() { } } -void testDifferentiate() { +TEST(AutodiffTest, Differentiate) { // Note: can't use IRParser for this test due to issue #23989 auto graph = std::make_shared(); std::vector sizes{2, 3, 4}; @@ -229,7 +230,7 @@ void testDifferentiate() { ->run(*grad_spec.df); } -void testDifferentiateWithRequiresGrad() { +TEST(AutodiffTest, DifferentiateWithRequiresGrad) { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): diff --git a/test/cpp/jit/test_class_import.cpp b/test/cpp/jit/test_class_import.cpp index 82bc0cf3bccc..ffa845b3e2a8 100644 --- a/test/cpp/jit/test_class_import.cpp +++ b/test/cpp/jit/test_class_import.cpp @@ -1,7 +1,7 @@ -#include -#include +#include #include +#include #include #include #include @@ -45,7 +45,7 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } -void testClassImport() { +TEST(ClassImportTest, Basic) { auto cu1 = std::make_shared(); auto cu2 = std::make_shared(); std::vector constantTable; @@ -80,7 +80,7 @@ void testClassImport() { ASSERT_FALSE(c); } -void testScriptObject() { +TEST(ClassImportTest, ScriptObject) { Module m1("m1"); Module m2("m2"); std::vector constantTable; @@ -114,7 +114,7 @@ def __init__(self, x): return x )JIT"; -void testClassDerive() { +TEST(ClassImportTest, ClassDerive) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu); const auto self = SimpleSelf(cls); @@ -142,7 +142,7 @@ class FooBar1234(Module): return (self.f).top() )JIT"; -void testSaveLoadTorchbind() { +TEST(ClassImportTest, CustomClass) { auto cu1 = std::make_shared(); std::vector constantTable; // Import different versions of 
FooTest into two namespaces. diff --git a/test/cpp/jit/test_class_parser.cpp b/test/cpp/jit/test_class_parser.cpp index 45e37103bb5a..a5b19f63fd3f 100644 --- a/test/cpp/jit/test_class_parser.cpp +++ b/test/cpp/jit/test_class_parser.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -15,7 +17,7 @@ const auto testSource = R"JIT( an_attribute : Tensor )JIT"; -void testClassParser() { +TEST(ClassParserTest, Basic) { Parser p(std::make_shared(testSource)); std::vector definitions; std::vector resolvers; diff --git a/test/cpp/jit/test_cleanup_passes.cpp b/test/cpp/jit/test_cleanup_passes.cpp index 2f2ca4e0a19b..38ceef932eb0 100644 --- a/test/cpp/jit/test_cleanup_passes.cpp +++ b/test/cpp/jit/test_cleanup_passes.cpp @@ -1,19 +1,19 @@ +#include + #include #include #include #include -#include "test/cpp/jit/test_base.h" namespace torch { namespace jit { -void testCleanUpPasses() { +TEST(CleanupPassTest, Basic) { // Tests stability of clean up passes when dealing with constant pooling // and constant propagation. - { - auto graph = std::make_shared(); - parseIR( - R"IR( + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond.1 : Tensor, %suffix.1 : str): %3 : bool = aten::Bool(%cond.1) # o.py:6:7 @@ -31,20 +31,19 @@ graph(%cond.1 : Tensor, -> (%12) return (%25) )IR", - &*graph); - runCleanupPasses(graph); - testing::FileCheck() - .check_count( - "prim::Constant[value=\"same string with a twist\"]", - 1, - /*exactly=*/true) - ->run(*graph); + &*graph); + runCleanupPasses(graph); + testing::FileCheck() + .check_count( + "prim::Constant[value=\"same string with a twist\"]", + 1, + /*exactly=*/true) + ->run(*graph); - auto graph_after_pass_once = graph->toString(); - runCleanupPasses(graph); - auto graph_after_pass_twice = graph->toString(); - ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); - } + auto graph_after_pass_once = graph->toString(); + runCleanupPasses(graph); + auto graph_after_pass_twice = graph->toString(); + ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_code_template.cpp b/test/cpp/jit/test_code_template.cpp index e4d7d1ef856e..35897474f1f2 100644 --- a/test/cpp/jit/test_code_template.cpp +++ b/test/cpp/jit/test_code_template.cpp @@ -1,6 +1,6 @@ -#include "test/cpp/jit/test_base.h" -#include "test/cpp/jit/test_utils.h" +#include +#include #include "torch/csrc/jit/frontend/code_template.h" namespace torch { @@ -33,31 +33,29 @@ static const auto ct_expect = R"( int notest(int a) )"; -void testCodeTemplate() { - { - TemplateEnv e; - e.s("hi", "foo"); - e.v("what", {"is", "this"}); - TemplateEnv c(e); - c.s("hi", "foo2"); - ASSERT_EQ(e.s("hi"), "foo"); - ASSERT_EQ(c.s("hi"), "foo2"); - ASSERT_EQ(e.v("what")[0], "is"); - } +TEST(TestCodeTemplate, Copying) { + TemplateEnv e; + e.s("hi", "foo"); + e.v("what", {"is", "this"}); + TemplateEnv c(e); + c.s("hi", "foo2"); + ASSERT_EQ(e.s("hi"), "foo"); + ASSERT_EQ(c.s("hi"), "foo2"); + ASSERT_EQ(e.v("what")[0], "is"); +} - { - TemplateEnv e; - e.v("args", {"hi", "8"}); - e.v("bar", {"what\non many\nlines...", "7"}); - e.s("a", "3"); - e.s("b", "4"); - e.v("stuff", {"things...", "others"}); - e.v("empty", {}); - auto s = ct.format(e); - // std::cout << "'" << s << "'\n"; - // std::cout << "'" << ct_expect << "'\n"; - ASSERT_EQ(s, ct_expect); - } +TEST(TestCodeTemplate, Formatting) { + TemplateEnv e; + e.v("args", {"hi", "8"}); + e.v("bar", {"what\non many\nlines...", "7"}); + e.s("a", "3"); + e.s("b", "4"); + e.v("stuff", {"things...", 
"others"}); + e.v("empty", {}); + auto s = ct.format(e); + // std::cout << "'" << s << "'\n"; + // std::cout << "'" << ct_expect << "'\n"; + ASSERT_EQ(s, ct_expect); } } // namespace jit diff --git a/test/cpp/jit/test_constant_pooling.cpp b/test/cpp/jit/test_constant_pooling.cpp index b949c9a45b25..c8cb58e1886a 100644 --- a/test/cpp/jit/test_constant_pooling.cpp +++ b/test/cpp/jit/test_constant_pooling.cpp @@ -1,9 +1,10 @@ +#include + #include #include #include #include #include -#include "test/cpp/jit/test_base.h" #include #include @@ -11,26 +12,26 @@ namespace torch { namespace jit { -void testConstantPooling() { - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(ConstantPoolingTest, Int) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %8 : int = prim::Constant[value=1]() %10 : int = prim::Constant[value=1]() return (%8, %10) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant", 1, /*exactly*/ true) - ->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant", 1, /*exactly*/ true) + ->run(*graph); +} + +TEST(ConstantPoolingTest, PoolingAcrossBlocks) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond : Tensor): %a : str = prim::Constant[value="bcd"]() %3 : bool = aten::Bool(%cond) @@ -44,17 +45,18 @@ graph(%cond : Tensor): %7 : (str, str) = prim::TupleConstruct(%a, %b) return (%7) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) - ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) - ->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) + ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) + ->run(*graph); +} + +TEST(ConstantPoolingTest, PoolingDifferentDevices) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %2 : int = prim::Constant[value=2]() %1 : int = prim::Constant[value=1]() @@ -70,22 +72,21 @@ graph(): prim::Print(%x, %y, %z) return (%1) )IR", - &*graph); - // three tensors created - two different devices among the three - // don't have good support for parsing tensor constants - ConstantPropagation(graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count( - "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->check_count( - "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->run(*graph); - } + &*graph); + // three tensors created - two different devices among the three + // don't have good support for parsing tensor constants + ConstantPropagation(graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count( + "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->check_count( + "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->run(*graph); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_create_autodiff_subgraphs.cpp b/test/cpp/jit/test_create_autodiff_subgraphs.cpp index 8da6d9d6a1b2..e97043f84d24 100644 --- a/test/cpp/jit/test_create_autodiff_subgraphs.cpp +++ b/test/cpp/jit/test_create_autodiff_subgraphs.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include 
"test/cpp/jit/test_utils.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" @@ -6,7 +7,7 @@ namespace torch { namespace jit { -void testCreateAutodiffSubgraphs() { +TEST(CreateAutodiffSubgraphsTest, Basic) { auto graph = build_lstm(); CreateAutodiffSubgraphs(graph, /*threshold=*/2); // all of the ops are within the DifferentiableGraph diff --git a/test/cpp/jit/test_custom_class.cpp b/test/cpp/jit/test_custom_class.cpp index 543fbc20eb3d..25c518d3142c 100644 --- a/test/cpp/jit/test_custom_class.cpp +++ b/test/cpp/jit/test_custom_class.cpp @@ -1,3 +1,5 @@ +#include + #include #include @@ -318,7 +320,7 @@ TORCH_LIBRARY(_TorchScriptTesting, m) { } // namespace -void testTorchbindIValueAPI() { +TEST(CustomClassTest, TorchbindIValueAPI) { script::Module m("m"); // test make_custom_class API diff --git a/test/cpp/jit/test_custom_operators.cpp b/test/cpp/jit/test_custom_operators.cpp index 529b36385bd4..d3f61268e8f1 100644 --- a/test/cpp/jit/test_custom_operators.cpp +++ b/test/cpp/jit/test_custom_operators.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/ir/alias_analysis.h" @@ -11,134 +12,135 @@ namespace torch { namespace jit { -void testCustomOperators() { - { - torch::RegisterOperators reg( - "foo::bar", [](double a, at::Tensor b) { return a + b; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, InferredSchema) { + torch::RegisterOperators reg( + "foo::bar", [](double a, at::Tensor b) { return a + b; }); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); + ASSERT_EQ(ops.size(), 1); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar"); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar"); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } - { - torch::RegisterOperators reg( - "foo::bar_with_schema(float a, Tensor b) -> Tensor", - [](double a, at::Tensor b) { return a + b; }); + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); +} - auto& ops = - getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, ExplicitSchema) { + torch::RegisterOperators reg( + "foo::bar_with_schema(float a, Tensor b) -> Tensor", + [](double a, at::Tensor b) { return a + b; }); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); + auto& ops = + 
getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } - { - // Check that lists work well. - torch::RegisterOperators reg( - "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", - [](torch::List ints, - torch::List floats, - torch::List tensors) { return floats; }); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists"); - - ASSERT_EQ(op->schema().arguments().size(), 3); - ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); - ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); - ASSERT_TRUE( - op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); - ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); - - Stack stack; - push(stack, c10::List({1, 2})); - push(stack, c10::List({1.0, 2.0})); - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); - - ASSERT_EQ(output.size(), 2); - ASSERT_EQ(output.get(0), 1.0); - ASSERT_EQ(output.get(1), 2.0); - } - { - torch::RegisterOperators reg( - "foo::lists2(Tensor[] tensors) -> Tensor[]", - [](torch::List tensors) { return tensors; }); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); +} - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, ListParameters) { + // Check that lists work well. 
+ torch::RegisterOperators reg( + "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", + [](torch::List ints, + torch::List floats, + torch::List tensors) { return floats; }); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists"); + + ASSERT_EQ(op->schema().arguments().size(), 3); + ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); + ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); + ASSERT_TRUE( + op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); + ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); + + Stack stack; + push(stack, c10::List({1, 2})); + push(stack, c10::List({1.0, 2.0})); + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 2); + ASSERT_EQ(output.get(0), 1.0); + ASSERT_EQ(output.get(1), 2.0); +} - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists2"); +TEST(CustomOperatorTest, ListParameters2) { + torch::RegisterOperators reg( + "foo::lists2(Tensor[] tensors) -> Tensor[]", + [](torch::List tensors) { return tensors; }); - ASSERT_EQ(op->schema().arguments().size(), 1); - ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists2"); - Stack stack; - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); + ASSERT_EQ(op->schema().arguments().size(), 1); + ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); - ASSERT_EQ(output.size(), 1); - ASSERT_TRUE(output.get(0).allclose(at::ones(5))); - } + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + + Stack stack; + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 1); + ASSERT_TRUE(output.get(0).allclose(at::ones(5))); } -void testCustomOperatorAliasing() { +TEST(CustomOperatorTest, Aliasing) { torch::RegisterOperators reg( "foo::aliasing", [](at::Tensor a, at::Tensor b) -> at::Tensor { a.add_(b); @@ -182,77 +184,65 @@ graph(%x: Tensor, %y: Tensor): } } -void testIValueKWargs() { - const auto text = R"( - def foo(a : int, b : int, c : int = 4): - return a + 2*b + 3*c - )"; - auto cu = compile(text); - auto result = cu->get_function("foo")({1}, {{"b", 3}}); - ASSERT_EQ(result.toInt(), 19); -} - -void testTemplatedOperatorCreator() { - constexpr char op_list[] = "foofoo::bar.template;foo::another"; +static constexpr char op_list[] = "foofoo::bar.template;foo::another"; #define TORCH_SELECTIVE_NAME_IN_SCHEMA(l, n) \ 
torch::detail::SelectiveStr(n) - { - // Try to register an op name that does not exist in op_list. - // Expected: the op name is not registered. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); - ASSERT_EQ(ops.size(), 0); - } +TEST(TestCustomOperator, OperatorGeneratorUndeclared) { + // Try to register an op name that does not exist in op_list. + // Expected: the op name is not registered. + torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); + ASSERT_EQ(ops.size(), 0); +} - { - // The operator should be successfully registered since its name is in the - // whitelist. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foofoo::bar"); - - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); - - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } +TEST(TestCustomOperator, OperatorGeneratorBasic) { + // The operator should be successfully registered since its name is in the + // whitelist. 
+ torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foofoo::bar"); + + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); } } // namespace jit diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 953d1bf42fc0..92baba1168da 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -2225,5 +2225,15 @@ void testProfilerDisableInCallback() { t.join(); } +void testIValueKWargs() { + const auto text = R"( + def foo(a : int, b : int, c : int = 4): + return a + 2*b + 3*c + )"; + auto cu = compile(text); + auto result = cu->get_function("foo")({1}, {{"b", 3}}); + ASSERT_EQ(result.toInt(), 19); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 45d7f48b1f8a..8f43882c9e22 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -9,22 +9,14 @@ namespace torch { namespace jit { #define TH_FORALL_TESTS(_) \ - _(ADFormulas) \ _(Attributes) \ _(Blocks) \ _(CallStack) \ _(CallStackCaching) \ - _(CodeTemplate) \ _(ControlFlow) \ - _(CreateAutodiffSubgraphs) \ - _(CustomOperators) \ - _(CustomOperatorAliasing) \ - _(TemplatedOperatorCreator) \ _(IValueKWargs) \ _(CustomFusion) \ _(SchemaMatching) \ - _(Differentiate) \ - _(DifferentiateWithRequiresGrad) \ _(FromQualString) \ _(InternedStrings) \ _(PassManagement) \ @@ -35,12 +27,9 @@ namespace jit { _(SubgraphUtils) \ _(SubgraphUtilsVmap) \ _(IRParser) \ - _(ConstantPooling) \ - _(CleanUpPasses) \ _(THNNConv) \ _(ATenNativeBatchNorm) \ _(NoneSchemaMatch) \ - _(ClassParser) \ _(UnifyTypes) \ _(Profiler) \ _(FallbackGraphs) \ @@ -61,15 +50,11 @@ namespace jit { _(ModuleDeepcopyAliasing) \ _(ModuleDefine) \ _(QualifiedName) \ - _(ClassImport) \ - _(ScriptObject) \ _(ExtraFilesHookPreference) \ _(SaveExtraFilesHook) \ _(TypeTags) \ _(DCE) \ _(CustomFusionNestedBlocks) \ - _(ClassDerive) \ - _(SaveLoadTorchbind) \ _(ModuleInterfaceSerialization) \ _(ModuleCloneWithModuleInterface) \ _(ClassTypeAddRemoveAttr) \ @@ -100,7 +85,6 @@ namespace jit { _(LiteInterpreterHierarchyModuleInfo) \ _(LiteInterpreterDuplicatedClassTypeModuleInfo) \ _(LiteInterpreterEval) \ - _(TorchbindIValueAPI) \ _(LiteInterpreterDict) \ _(LiteInterpreterFindAndRunMethod) \ _(LiteInterpreterFindWrongMethodName) \ From 246bd9422a1f64965ad9082798c8b17f96bc2924 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Wed, 23 Sep 2020 14:26:03 -0700 Subject: [PATCH 063/449] gtestify dce and fuser tests (#45055) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45055 See https://github.com/pytorch/pytorch/pull/45018 for context. 
Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D23811085 Pulled By: suo fbshipit-source-id: 45008e41f2394d2ba319745b0340392e1b3d3172 --- test/cpp/jit/test_dce.cpp | 6 +++--- test/cpp/jit/test_fuser.cpp | 41 +++++++++++++++++++------------------ test/cpp/jit/tests.h | 7 +------ 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/test/cpp/jit/test_dce.cpp b/test/cpp/jit/test_dce.cpp index 5799913c316a..6f9161d0d9ae 100644 --- a/test/cpp/jit/test_dce.cpp +++ b/test/cpp/jit/test_dce.cpp @@ -1,12 +1,12 @@ -#include -#include +#include +#include #include #include namespace torch { namespace jit { -void testDCE() { +TEST(EliminateDeadCodeTest, Basic) { auto graph = std::make_shared(); // Consider the following loop: diff --git a/test/cpp/jit/test_fuser.cpp b/test/cpp/jit/test_fuser.cpp index ee0ea060f02f..ef595215b882 100644 --- a/test/cpp/jit/test_fuser.cpp +++ b/test/cpp/jit/test_fuser.cpp @@ -1,4 +1,4 @@ -#include "test/cpp/jit/test_base.h" +#include #include #include "ATen/core/interned_strings.h" @@ -56,28 +56,27 @@ namespace torch { namespace jit { -void testFusion() { - auto testSimple = [&] { - const auto graph_string = R"IR( +TEST(FuserTest, TestSimple_CUDA) { + const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): %2 : Tensor = aten::mul(%0, %1) return (%2))IR"; - Graph graph; - torch::jit::parseIR(graph_string, &graph); - - auto a = at::rand({3, 4}, at::kCUDA); - auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); - auto o = at::zeros({3, 4}, at::kCUDA); - auto outputs = debugLaunchGraph(graph, {a, b}); - ASSERT_EQ(outputs.size(), 1); - auto o2 = a * b; - float max_diff = (o2 - outputs[0]).abs().max().item(); - // std::cout << "max diff: " << max_diff << "\n"; - ASSERT_EQ(max_diff, 0); - }; - testSimple(); + Graph graph; + torch::jit::parseIR(graph_string, &graph); + + auto a = at::rand({3, 4}, at::kCUDA); + auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); + auto o = at::zeros({3, 4}, at::kCUDA); + auto outputs = debugLaunchGraph(graph, {a, b}); + ASSERT_EQ(outputs.size(), 1); + auto o2 = a * b; + float max_diff = (o2 - outputs[0]).abs().max().item(); + // std::cout << "max diff: " << max_diff << "\n"; + ASSERT_EQ(max_diff, 0); +} +TEST(FuserTest, TestOne_CUDA) { auto testOne = [&](int ti, int tj) { const auto graph_string = R"IR( graph(%0 : Tensor, @@ -132,7 +131,9 @@ void testFusion() { testOne(0, 1); testOne(1, 2); testOne(0, 2); +} +TEST(FuserTest, FusedConcat_CUDA) { const auto graph_string0 = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -175,7 +176,7 @@ void testFusion() { }; } -void testFusionAliasing() { +TEST(FuserTest, FusionAliasing) { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -200,7 +201,7 @@ void testFusionAliasing() { ->run(*g); } -void testRegisterFusionCachesKernel() { +TEST(FuserTest, KernelCaching) { // Constructs two functionally equivalent graphs const auto graph0_string = R"IR( graph(%0 : Float(2, 3, 4), diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 8f43882c9e22..186aaaec2bba 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -21,7 +21,6 @@ namespace jit { _(InternedStrings) \ _(PassManagement) \ _(Proto) \ - _(RegisterFusionCachesKernel) \ _(SchemaParser) \ _(TopologicalIndex) \ _(SubgraphUtils) \ @@ -53,7 +52,6 @@ namespace jit { _(ExtraFilesHookPreference) \ _(SaveExtraFilesHook) \ _(TypeTags) \ - _(DCE) \ _(CustomFusionNestedBlocks) \ _(ModuleInterfaceSerialization) \ _(ModuleCloneWithModuleInterface) \ @@ -93,12 +91,10 @@ namespace jit { 
_(MobileSaveLoadParameters) \ _(MobileSaveLoadParametersEmpty) \ _(LiteSGD) \ - _(LiteSequentialSampler) \ - _(FusionAliasing) + _(LiteSequentialSampler) #if defined(USE_CUDA) #define TH_FORALL_TESTS_CUDA(_) \ - _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ _(Interp) \ @@ -203,7 +199,6 @@ namespace jit { _(GPU_FusionThreadPredicate) #else #define TH_FORALL_TESTS_CUDA(_) \ - _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ _(Interp) \ From 2a1a51facbba6f9be2cc80aa6b91d795666eda46 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 23 Sep 2020 14:49:02 -0700 Subject: [PATCH 064/449] Fix typos. (#45195) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45195 Fix some typos in reducer class. ghstack-source-id: 112673443 Test Plan: N/A Reviewed By: rohan-varma Differential Revision: D23862399 fbshipit-source-id: 0dc69e5ea1fa7d33c85d1909b2216bcd1f579f6a --- torch/csrc/distributed/c10d/reducer.cpp | 4 ++-- torch/csrc/distributed/c10d/reducer.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index a895bea5fc26..1a5766eea84e 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -190,7 +190,7 @@ Reducer::Reducer( // used to override how DDP communicates gradients across ranks, this can be // used for algorithms like Gradient Compression/GossipGrad. This hook can be // registered from Python API using `register_comm_hook`. `PythonCommHook` -// enables registering a Python hook and is a sub class of `CommHookInterface`. +// enables registering a Python hook and is a subclass of `CommHookInterface`. // `CommHookInterface` can be used to implement CPP hooks in the future. Reducer::~Reducer() noexcept(false) { @@ -493,7 +493,7 @@ void Reducer::autograd_hook(VariableIndex index) { // rebuilt_param_indices_ based on gradient arriving order, and then at the // end of finalize_backward(), buckets will be rebuilt based on // rebuilt_params_ and rebuilt_param_indices_, and then will be broadcasted - // and initialized. Also we only need to dump tensors and parameter indcies of + // and initialized. Also we only need to dump tensors and parameter indices of // one replica. push_rebuilt_params(index); diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index d45e5c2b90e1..3b441c99a3b6 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -179,7 +179,7 @@ class Reducer { // and on the same device can be batched. The tensor that represents the // flattened gradient uses the same type and is placed on the same device. // Buckets are filled as the gradients they hold are computed (triggered by - // autograd hooks). Buckets are reduced in a predetemined order that is + // autograd hooks). Buckets are reduced in a predetermined order that is // identical across processes. struct BucketReplica { // Flattened (1 dimensional) contents of bucket. 
From 8e0fc711f49cecac15a944ede4451703c3db1c02 Mon Sep 17 00:00:00 2001 From: Alex Suhan Date: Wed, 23 Sep 2020 14:49:27 -0700 Subject: [PATCH 065/449] [TensorExpr] Remove unused EvalConstExpr function (#45180) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45180 Test Plan: build Reviewed By: ezyang Differential Revision: D23877151 Pulled By: asuhan fbshipit-source-id: a5d4d211c1dc85e6f7045330606163a933b9474e --- torch/csrc/jit/tensorexpr/loopnest.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index b7862fb953c1..2cbc7bdf186d 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -23,17 +23,6 @@ namespace torch { namespace jit { namespace tensorexpr { -namespace { - -// Evaluates a constant expression and returns its value. -template -static T EvalConstExpr(const ExprHandle& expr) { - ExprEval eval(expr); - return eval.value(); -} - -} // namespace - class IndexFlattener : public IRMutator { public: Stmt* flatten(Stmt* s) { From 049599886289a5ddb2b16a58564a66ecabac0142 Mon Sep 17 00:00:00 2001 From: Alex Suhan Date: Wed, 23 Sep 2020 14:53:17 -0700 Subject: [PATCH 066/449] [TensorExpr] Disallow arithmetic binary operations on Bool (#44677) Summary: Arithmetic operations on Bool aren't fully supported in the evaluator. Moreover, such semantics can be implemented by the client code through insertion of explicit casts to widen and narrow to the desired types. Pull Request resolved: https://github.com/pytorch/pytorch/pull/44677 Test Plan: test_tensorexpr --gtest_filter=TensorExprTest.ExprDisallowBoolArithmetic python test/test_jit_fuser_te.py Reviewed By: agolynski Differential Revision: D23801412 Pulled By: asuhan fbshipit-source-id: fff5284e3a216655dbf5a9a64d1cb1efda271a36 --- test/cpp/tensorexpr/test_expr.cpp | 18 +++++++ test/cpp/tensorexpr/tests.h | 1 + test/test_jit_fuser_te.py | 77 ++++++++++++++++++++++++++++ torch/csrc/jit/tensorexpr/kernel.cpp | 22 +++++--- torch/csrc/jit/tensorexpr/types.h | 4 ++ 5 files changed, 116 insertions(+), 6 deletions(-) diff --git a/test/cpp/tensorexpr/test_expr.cpp b/test/cpp/tensorexpr/test_expr.cpp index c1386a85764b..e94e70aa6b38 100644 --- a/test/cpp/tensorexpr/test_expr.cpp +++ b/test/cpp/tensorexpr/test_expr.cpp @@ -164,6 +164,24 @@ void testExprDoubleTest() { ASSERT_EQ(eval.value(), 2 + (3 * 3 + 4)); } +void testExprDisallowBoolArithmetic() { + KernelScope kernel_scope; + VarHandle x("x", kBool); + VarHandle y("y", kBool); + std::string error{"arithmetic binary operations on Bool not supported"}; + ASSERT_THROWS_WITH((x + y), error); + ASSERT_THROWS_WITH((x - y), error); + ASSERT_THROWS_WITH((x * y), error); + ASSERT_THROWS_WITH((x / y), error); + ASSERT_THROWS_WITH((x & y), error); + ASSERT_THROWS_WITH((x | y), error); + ASSERT_THROWS_WITH((x ^ y), error); + ASSERT_THROWS_WITH((x << y), error); + ASSERT_THROWS_WITH((x >> y), error); + ASSERT_THROWS_WITH(Max::make(x, y, /*propagate_nans=*/true), error); + ASSERT_THROWS_WITH(Min::make(x, y, /*propagate_nans=*/true), error); +} + void testExprVectorAdd01() { KernelScope kernel_scope; const int kVectorSize = 8; diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index d0a9aa840b91..2d42a4a93967 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -23,6 +23,7 @@ namespace jit { _(ExprLongTest) \ _(ExprHalfTest) \ _(ExprDoubleTest) \ + _(ExprDisallowBoolArithmetic) \ _(ExprVectorAdd01) \ 
_(ExprCompareSelectEQ) \ _(ExprCompareSelectDtypes) \ diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 453047eca8be..f9aca9a5dea1 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1,5 +1,6 @@ from collections import defaultdict +import operator import unittest import contextlib import torch @@ -459,6 +460,82 @@ def func(x): graph = backward_graph(s, skip_check=True) self.assertAllFused(graph, except_for={'aten::div', 'prim::Constant'}) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_add_bool(self): + def f(x, y, z): + return x + y + z + + x = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + y = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + z = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + + ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False) + self.assertAllFused(ge.graph_for(x, y, z)) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_mul_bool(self): + def f(x, y, z): + return x * y * z + + x = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + y = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + z = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + + ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False) + self.assertAllFused(ge.graph_for(x, y, z)) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_div_bool(self): + def f(x, y, z): + return (x + y) / z + + x = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + y = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + z = torch.ones_like(x, dtype=torch.bool, device='cuda') + + ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False) + self.assertAllFused(ge.graph_for(x, y, z)) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_bitwise_ops(self): + def apply(fn): + return lambda x, y, z: fn(fn(x, y), z) + + dtypes = [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + torch.bool, + ] + binary_ops = [ + operator.__and__, + operator.__or__, + operator.__xor__ + ] + devices = ["cuda"] + for dtype, op, device in product(dtypes, binary_ops, devices): + try: + x = self.data_for(dtype, device) + y = self.data_for(dtype, device) + z = self.data_for(dtype, device) + fn = apply(op) + ref = fn(x, y, z) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. + continue + try: + t = torch.jit.trace(fn, (x, y, z)) + self.assertEqual(ref, t(x, y, z)) + self.assertAllFused(t.graph_for(x, y, z)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device]) + ) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") def test_comparison_eq_ne(self): def f(x, y): diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 5cd414bbe2df..293ea780ed27 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -673,11 +673,20 @@ Tensor* TensorExprKernel::computeFourOperand( }); } +namespace { + +// Convert boolean to integer, if needed. +ExprHandle boolToInteger(const ExprHandle& x) { + return x.dtype().scalar_type() == ScalarType::Bool ? 
cast(x) : x; +} + +} // namespace + Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { switch (v->node()->kind()) { case aten::add: { auto add_lambda = [](const ExprHandle& lhs, const ExprHandle& rhs) { - return lhs + rhs; + return boolToInteger(lhs) + boolToInteger(rhs); }; TORCH_INTERNAL_ASSERT( v->node()->inputs().size() == 2 || v->node()->inputs().size() == 3); @@ -694,6 +703,7 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::sub: { auto sub_lambda = [](const ExprHandle& lhs, const ExprHandle& rhs) { + // NB: sub isn't supported on boolean, no need to promote to integer. return lhs - rhs; }; TORCH_INTERNAL_ASSERT( @@ -706,35 +716,35 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::mul: { return computeTwoOperand( "aten_mul", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return lhs * rhs; + return boolToInteger(lhs) * boolToInteger(rhs); }); } break; case aten::div: { return computeTwoOperand( "aten_div", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return lhs / rhs; + return boolToInteger(lhs) / boolToInteger(rhs); }); } break; case aten::__and__: { return computeTwoOperand( "aten_and", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return lhs & rhs; + return boolToInteger(lhs) & boolToInteger(rhs); }); } break; case aten::__or__: { return computeTwoOperand( "aten_or", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return lhs | rhs; + return boolToInteger(lhs) | boolToInteger(rhs); }); } break; case aten::__xor__: { return computeTwoOperand( "aten_xor", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return lhs ^ rhs; + return boolToInteger(lhs) ^ boolToInteger(rhs); }); } break; diff --git a/torch/csrc/jit/tensorexpr/types.h b/torch/csrc/jit/tensorexpr/types.h index 8dd67c8b7125..8e39ad231545 100644 --- a/torch/csrc/jit/tensorexpr/types.h +++ b/torch/csrc/jit/tensorexpr/types.h @@ -124,6 +124,10 @@ inline Dtype BinaryOpDtype( Dtype op1_dtype, Dtype op2_dtype, ScalarType ret_type = ScalarType::None) { + if (op1_dtype.scalar_type() == ScalarType::Bool || + op2_dtype.scalar_type() == ScalarType::Bool) { + throw malformed_input("arithmetic binary operations on Bool not supported"); + } if (op1_dtype == op2_dtype) { if (ret_type == ScalarType::None) { return op1_dtype; From f93ead6d37b476576b50a2f550c5898415a1fe35 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Sep 2020 15:37:50 -0700 Subject: [PATCH 067/449] [quant][eagermode] Custom module support (#44835) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44835 This is for feature parity with fx graph mode quantization Test Plan: Imported from OSS Reviewed By: z-a-f Differential Revision: D23745086 fbshipit-source-id: ae2fc86129f9896d5a9039b73006a4da15821307 --- test/quantization/test_quantize.py | 113 ++++++++++++++++++ torch/quantization/__init__.py | 1 + torch/quantization/quantize.py | 87 +++++++++----- .../testing/_internal/common_quantization.py | 23 +++- 4 files changed, 193 insertions(+), 31 deletions(-) diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index 91594da111c1..e54eb33770c2 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -14,6 +14,8 @@ fuse_modules, quantize_dynamic, QuantWrapper, + QuantStub, + DeQuantStub, QConfig, default_qconfig, default_qat_qconfig, @@ -21,6 +23,8 @@ per_channel_dynamic_qconfig, float16_dynamic_qconfig, float_qparams_dynamic_qconfig, + 
register_observed_custom_module_mapping, + register_quantized_custom_module_mapping, ) from torch.testing._internal.common_quantization import ( @@ -571,6 +575,115 @@ def forward(self, indices, offsets, per_sample_weights, linear_in): self.checkLinear(model.fc) self.checkDynamicQuantizedModule(quantized_model.emb, torch.nn.quantized.EmbeddingBag, torch.quint8) + @skipIfNoFBGEMM + def test_custom_module_class(self): + class CustomModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + return self.conv(x) + + class ObservedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_float(cls, float_module): + assert hasattr(float_module, 'qconfig') + observed = cls(float_module.conv) + observed.qconfig = float_module.qconfig + return observed + + class QuantizedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 'activation_post_process') + observed_module.conv.activation_post_process = \ + observed_module.activation_post_process + quantized = cls(nnq.Conv2d.from_float(observed_module.conv)) + return quantized + + register_observed_custom_module_mapping(CustomModule, ObservedCustomModule) + register_quantized_custom_module_mapping(CustomModule, QuantizedCustomModule) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.custom = CustomModule() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.custom(x) + x = self.dequant(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv1(x) + x = self.conv2(x) + x = self.dequant(x) + return x + + data = torch.randn(1, 1, 1, 1) + # instantiate M and RefM and align the parameters + original_m = M() + original_ref_m = RefM() + original_ref_m.conv1.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv1.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + original_ref_m.conv2.weight = torch.nn.Parameter(original_m.custom.conv.weight.detach()) + original_ref_m.conv2.bias = torch.nn.Parameter(original_m.custom.conv.bias.detach()) + + original_m.qconfig = default_qconfig + m = prepare(original_m) + self.checkObservers(m) + # calibration + m(data) + # all activation observers are inserted in the top level module + + # check converted/quantized model + m = convert(m) + # check if the module is properly quantized + self.assertEqual(type(m.quant), nnq.Quantize) + self.assertEqual(type(m.conv), nnq.Conv2d) + self.assertEqual(type(m.custom.conv), nnq.Conv2d) + self.assertEqual(type(m.dequant), nnq.DeQuantize) + res = m(data) + + # quantize the reference model + original_ref_m.eval() + original_ref_m.qconfig = default_qconfig + ref_m = prepare(original_ref_m) + ref_m(data) + ref_m = convert(ref_m) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + @skipIfNoFBGEMM class TestPostTrainingDynamic(QuantizationTestCase): diff --git 
a/torch/quantization/__init__.py b/torch/quantization/__init__.py index 3193c332469f..31943e56e6a3 100644 --- a/torch/quantization/__init__.py +++ b/torch/quantization/__init__.py @@ -46,6 +46,7 @@ def default_eval_fn(model, calib_data): 'register_quantized_custom_mdoule_mapping', 'get_quantized_custom_module_class', 'is_custom_module_class', + 'is_observed_custom_module', # Sub functions for `prepare` and `swap_module` 'propagate_qconfig_', 'add_quant_dequant', 'add_observer_', 'swap_module', 'default_eval_fn', 'get_observer_dict', diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index 8bc3b6ffc532..19a27e62ac5b 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -14,6 +14,14 @@ get_qat_module_mappings, get_qconfig_propagation_list) +from .custom_module_class_mappings import ( + is_custom_module_class, + get_observed_custom_module_class, + get_quantized_custom_module_class, + mark_observed_custom_module, + is_observed_custom_module, +) + from .stubs import DeQuantStub, QuantWrapper from .qconfig import default_dynamic_qconfig, float16_dynamic_qconfig, float_qparams_dynamic_qconfig @@ -117,38 +125,52 @@ def get_activation_post_process(qconfig, device): activation.to(device) return activation - for child in module.children(): + def needs_observation(m): + return hasattr(m, 'qconfig') and m.qconfig is not None + + def insert_activation_post_process(m): + """ Adds an activation post process module and register + a post hook that calls the module + """ + if needs_observation(m): + # observer and hook will be gone after we swap the module + m.add_module('activation_post_process', get_activation_post_process(m.qconfig, device)) + # Register observer as the first entry in the hook list + # All post forward hooks are preserved and will be executed after the observer before convert + handle = register_activation_post_process_hook(m) + m._forward_hooks.move_to_end(handle.id, last=False) + + for name, child in module.named_children(): if type(child) == nnq.FloatFunctional or type(child) == nnq.QFunctional: if hasattr(child, 'qconfig') and child.qconfig is not None: child.activation_post_process = get_activation_post_process(child.qconfig, device) elif non_leaf_module_list is not None and type(child) in non_leaf_module_list: - if hasattr(child, 'qconfig') and child.qconfig is not None: - child.add_module('activation_post_process', get_activation_post_process(child.qconfig, device)) - register_activation_post_process_hook(child) - + insert_activation_post_process(child) + # TODO: remove + if needs_observation(child): # Attaching prehook if prehook is not None: child.add_module('activation_pre_process', prehook()) child.register_forward_pre_hook(_observer_forward_pre_hook) + elif needs_observation(child) and is_custom_module_class(type(child)): + observed_child = get_observed_custom_module_class(type(child)).from_float(child) + mark_observed_custom_module(observed_child, type(child)) + setattr(module, name, observed_child) + insert_activation_post_process(observed_child) else: add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, prehook) # Insert observers only for leaf nodes, note that this observer is for # the output of the module, for input QuantStub will observe them - if hasattr(module, 'qconfig') and module.qconfig is not None and \ - len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \ + if len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \ and type(module) in 
qconfig_propagation_list: - # observer and hook will be gone after we swap the module - module.add_module('activation_post_process', get_activation_post_process(module.qconfig, device)) - # Register observer as the first entry in the hook list - # All post forward hooks are preserved and will be executed after the observer before convert - handle = register_activation_post_process_hook(module) - module._forward_hooks.move_to_end(handle.id, last=False) - - # Attaching prehook - if prehook is not None: - module.add_module('activation_pre_process', prehook()) - module.register_forward_pre_hook(_observer_forward_pre_hook) + insert_activation_post_process(module) + # TOOD: remove + if needs_observation(module): + # Attaching prehook + if prehook is not None: + module.add_module('activation_pre_process', prehook()) + module.register_forward_pre_hook(_observer_forward_pre_hook) def get_unique_devices_(module): return {p.device for p in module.parameters()} | \ @@ -429,7 +451,10 @@ def _convert(module, mapping=None, inplace=False): nniqat.ConvBnReLU2d) for name, mod in module.named_children(): - if type(mod) not in SWAPPABLE_MODULES: + # both swappable modules and observed custom modules are + # swapped as one unit + if type(mod) not in SWAPPABLE_MODULES and \ + not is_observed_custom_module(mod): _convert(mod, mapping, inplace=True) reassign[name] = swap_module(mod, mapping) @@ -452,15 +477,15 @@ def swap_module(mod, mapping): new_mod = mod # Always replace dequantstub with dequantize if hasattr(mod, 'qconfig') and mod.qconfig is not None or type(mod) == DeQuantStub: - if type(mod) in mapping: - # respect device affinity when swapping modules - devices = get_unique_devices_(mod) - assert len(devices) <= 1, ( - "swap_module only works with cpu or single-device CUDA modules, " - "but got devices {}".format(devices) - ) - device = next(iter(devices)) if len(devices) > 0 else None + swapped = False + if is_observed_custom_module(mod): + new_mod = get_quantized_custom_module_class(mod._FLOAT_MODULE).from_observed(mod) + swapped = True + elif type(mod) in mapping: new_mod = mapping[type(mod)].from_float(mod) + swapped = True + + if swapped: # Preserve module's pre forward hooks. 
They'll be called on quantized input for pre_hook_fn in mod._forward_pre_hooks.values(): new_mod.register_forward_pre_hook(pre_hook_fn) @@ -469,6 +494,14 @@ def swap_module(mod, mapping): for hook_fn in mod._forward_hooks.values(): if hook_fn is not _observer_forward_hook: new_mod.register_forward_hook(hook_fn) + + # respect device affinity when swapping modules + devices = get_unique_devices_(mod) + assert len(devices) <= 1, ( + "swap_module only works with cpu or single-device CUDA modules, " + "but got devices {}".format(devices) + ) + device = next(iter(devices)) if len(devices) > 0 else None if device: new_mod.to(device) return new_mod diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index 3edbd5dd7fcd..4031b2fdd0de 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -13,6 +13,10 @@ default_qconfig, default_dynamic_qconfig, default_per_channel_qconfig, QConfig, default_observer, default_weight_observer, \ propagate_qconfig_, convert, get_default_qconfig, quantize_dynamic_jit, quantize_jit, float_qparams_dynamic_qconfig, \ get_default_qat_qconfig +from torch.quantization import ( + is_custom_module_class, + is_observed_custom_module, +) from torch.quantization.quantization_mappings import ( get_dynamic_quant_module_mappings, get_qconfig_propagation_list, @@ -344,14 +348,25 @@ def checkObservers(self, module, propagate_qconfig_list=None): """ if propagate_qconfig_list is None: propagate_qconfig_list = get_qconfig_propagation_list() - if hasattr(module, 'qconfig') and module.qconfig is not None and \ - len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \ - and type(module) in propagate_qconfig_list: + + # check if a module is a leaf module, ignoring activation_post_process attribute + def is_leaf_module(module): + submodule_name_count = 0 + for name, _ in module.named_children(): + if name != 'activation_post_process': + submodule_name_count += 1 + return submodule_name_count == 0 + + if (hasattr(module, 'qconfig') and module.qconfig is not None and + is_leaf_module(module) and not isinstance(module, torch.nn.Sequential) + and type(module) in propagate_qconfig_list) or \ + is_custom_module_class(type(module)): self.assertTrue(hasattr(module, 'activation_post_process'), 'module: ' + str(type(module)) + ' do not have observer') # we don't need to check observers for child modules of the # qat modules - if type(module) not in get_qat_module_mappings().values(): + if type(module) not in get_qat_module_mappings().values() and \ + not is_observed_custom_module(module): for child in module.children(): self.checkObservers(child) From 76c185dccaca99b753d51a5c3eae6f8c67e61f82 Mon Sep 17 00:00:00 2001 From: Alex Suhan Date: Wed, 23 Sep 2020 17:03:48 -0700 Subject: [PATCH 068/449] [TensorExpr] When lanes differ, insert Broadcast instead of Cast (#45179) Summary: We need to check if dtypes differ in scalar type or lanes to decide between Cast and Broadcast. 
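In other words: when only the lane counts differ, the scalar term has to be widened with a `Broadcast`; a plain `Cast` is only correct for a scalar-type mismatch, and when both differ the `Cast` is applied on top of the broadcast value. A self-contained sketch of that decision, using stand-in types rather than the real tensorexpr classes (see the `ir_simplifier.cpp` hunk below for the actual code):

```cpp
// Stand-in types only; the real logic lives in TermExpander::mutate (below).
#include <cassert>

struct Dtype {
  int scalar_type; // stand-in for torch::jit::tensorexpr::ScalarType
  int lanes;       // vector width
};

enum class Fixup { None, CastOnly, BroadcastOnly, BroadcastThenCast };

// Lane mismatch is handled first (Broadcast), then any remaining
// scalar-type mismatch (Cast applied to the broadcast value).
Fixup reconcile(const Dtype& term, const Dtype& target) {
  const bool lanesDiffer = term.lanes != target.lanes;
  const bool scalarsDiffer = term.scalar_type != target.scalar_type;
  if (lanesDiffer && scalarsDiffer) return Fixup::BroadcastThenCast;
  if (lanesDiffer) return Fixup::BroadcastOnly;
  if (scalarsDiffer) return Fixup::CastOnly;
  return Fixup::None;
}

int main() {
  // A 1-lane int scalar folded into an 8-lane int term: only the lane
  // count differs, so a Broadcast (not a Cast) is required.
  assert(reconcile({/*scalar_type=*/0, /*lanes=*/1},
                   {/*scalar_type=*/0, /*lanes=*/8}) == Fixup::BroadcastOnly);
  return 0;
}
```

The new `SimplifyBroadcastTermExpander` test added below exercises exactly this broadcast-only case: the two `bc1` terms collapse to `2 * bc1`, and the 1-lane scalar `2` must be made multi-lane before the multiply.
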
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45179 Test Plan: test_tensorexpr --gtest_filter=TensorExprTest.SimplifyBroadcastTermExpander Reviewed By: bwasti Differential Revision: D23873316 Pulled By: asuhan fbshipit-source-id: ca141be67e10c2b6c5f2ff9c11e42dcfc62ac620 --- test/cpp/tensorexpr/test_simplify.cpp | 26 +++++++++++++++++++++ test/cpp/tensorexpr/tests.h | 1 + torch/csrc/jit/tensorexpr/ir_simplifier.cpp | 20 +++++++++++++--- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index f8c5cdd3546d..f0185884fc58 100644 --- a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -3964,5 +3964,31 @@ void testSimplifyRampSubBroadcast() { ASSERT_EQ(newRamp->lanes(), num_lanes); } +void testSimplifyBroadcastTermExpander() { + KernelScope kernel_scope; + int num_lanes = 8; + ExprHandle bc0 = Broadcast::make(ExprHandle(0), num_lanes); + ExprHandle bc1 = Broadcast::make(ExprHandle(1), num_lanes); + ExprHandle bc2 = Broadcast::make(ExprHandle(2), num_lanes); + // NB: We need a term in the middle which isn't simplified to trigger the + // relevant path in TermExpander::mutate. The two bc1 terms are brought + // together and simplified to 2 * bc1, which then needs to make 2 multi-lane. + ExprHandle simplified = IRSimplifier::simplify(bc1 + (bc0 / bc2) + bc1); + Buffer buf(BufHandle("buf", {num_lanes}, kInt)); + // The result isn't fully simplified currently and thus would be brittle to + // match. Observe its value instead. + auto store = Store::make( + buf, + {Ramp::make(0, 1, num_lanes)}, + simplified, + Broadcast::make(ExprHandle(1), num_lanes)); + SimpleIREvaluator eval(store, buf); + std::vector output(num_lanes); + eval(output); + for (int i = 0; i < num_lanes; ++i) { + ASSERT_EQ(output[i], 2); + } +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index 2d42a4a93967..34eeaa0de19a 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -219,6 +219,7 @@ namespace jit { _(SimplifyFuseConditions) \ _(SimplifySyncThreads) \ _(SimplifyRampSubBroadcast) \ + _(SimplifyBroadcastTermExpander) \ _(RegisterizerSimple) \ _(RegisterizerLoop) \ _(RegisterizerLoopFixedLoad) \ diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp index f6852b627969..37c856a2e618 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp @@ -1503,9 +1503,23 @@ const Expr* TermExpander::mutate(const Term* v) { if (lastNode) { // We want to avoid a leaving a CastNode on the scalar, so handle that // now. - if (v->scalar()->dtype() != lastNode->dtype()) { - lastNode = new Mul( - evaluateOp(new Cast(lastNode->dtype(), v->scalar())), lastNode); + auto termDtype = v->scalar()->dtype(); + auto lastNodeDtype = lastNode->dtype(); + if (termDtype != lastNodeDtype) { + const Expr* castV = v->scalar(); + // Take care of lane mismatch first. + if (termDtype.lanes() != lastNodeDtype.lanes()) { + castV = new Broadcast(v->scalar(), lastNodeDtype.lanes()); + } + // Now take care of scalar type as well. + if (termDtype.scalar_type() != lastNodeDtype.scalar_type()) { + castV = new Cast(lastNode->dtype(), castV); + // For scalars, we can simplify the cast further. 
+ if (lastNodeDtype.lanes() == 1) { + castV = evaluateOp(castV); + } + } + lastNode = new Mul(castV, lastNode); } else { lastNode = new Mul(v->scalar(), lastNode); } From 89c570ed0a1bfc096e2e299637c7c62831c3dd26 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 23 Sep 2020 17:25:57 -0700 Subject: [PATCH 069/449] Revert D23811085: gtestify dce and fuser tests Test Plan: revert-hammer Differential Revision: D23811085 (https://github.com/pytorch/pytorch/commit/246bd9422a1f64965ad9082798c8b17f96bc2924) Original commit changeset: 45008e41f239 fbshipit-source-id: 94c981f565cab9b710fe52a55bbe8dbf9c179c23 --- test/cpp/jit/test_dce.cpp | 6 +++--- test/cpp/jit/test_fuser.cpp | 41 ++++++++++++++++++------------------- test/cpp/jit/tests.h | 7 ++++++- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/test/cpp/jit/test_dce.cpp b/test/cpp/jit/test_dce.cpp index 6f9161d0d9ae..5799913c316a 100644 --- a/test/cpp/jit/test_dce.cpp +++ b/test/cpp/jit/test_dce.cpp @@ -1,12 +1,12 @@ -#include +#include +#include -#include #include #include namespace torch { namespace jit { -TEST(EliminateDeadCodeTest, Basic) { +void testDCE() { auto graph = std::make_shared(); // Consider the following loop: diff --git a/test/cpp/jit/test_fuser.cpp b/test/cpp/jit/test_fuser.cpp index ef595215b882..ee0ea060f02f 100644 --- a/test/cpp/jit/test_fuser.cpp +++ b/test/cpp/jit/test_fuser.cpp @@ -1,4 +1,4 @@ -#include +#include "test/cpp/jit/test_base.h" #include #include "ATen/core/interned_strings.h" @@ -56,27 +56,28 @@ namespace torch { namespace jit { -TEST(FuserTest, TestSimple_CUDA) { - const auto graph_string = R"IR( +void testFusion() { + auto testSimple = [&] { + const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): %2 : Tensor = aten::mul(%0, %1) return (%2))IR"; - Graph graph; - torch::jit::parseIR(graph_string, &graph); - - auto a = at::rand({3, 4}, at::kCUDA); - auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); - auto o = at::zeros({3, 4}, at::kCUDA); - auto outputs = debugLaunchGraph(graph, {a, b}); - ASSERT_EQ(outputs.size(), 1); - auto o2 = a * b; - float max_diff = (o2 - outputs[0]).abs().max().item(); - // std::cout << "max diff: " << max_diff << "\n"; - ASSERT_EQ(max_diff, 0); -} + Graph graph; + torch::jit::parseIR(graph_string, &graph); + + auto a = at::rand({3, 4}, at::kCUDA); + auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); + auto o = at::zeros({3, 4}, at::kCUDA); + auto outputs = debugLaunchGraph(graph, {a, b}); + ASSERT_EQ(outputs.size(), 1); + auto o2 = a * b; + float max_diff = (o2 - outputs[0]).abs().max().item(); + // std::cout << "max diff: " << max_diff << "\n"; + ASSERT_EQ(max_diff, 0); + }; + testSimple(); -TEST(FuserTest, TestOne_CUDA) { auto testOne = [&](int ti, int tj) { const auto graph_string = R"IR( graph(%0 : Tensor, @@ -131,9 +132,7 @@ TEST(FuserTest, TestOne_CUDA) { testOne(0, 1); testOne(1, 2); testOne(0, 2); -} -TEST(FuserTest, FusedConcat_CUDA) { const auto graph_string0 = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -176,7 +175,7 @@ TEST(FuserTest, FusedConcat_CUDA) { }; } -TEST(FuserTest, FusionAliasing) { +void testFusionAliasing() { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -201,7 +200,7 @@ TEST(FuserTest, FusionAliasing) { ->run(*g); } -TEST(FuserTest, KernelCaching) { +void testRegisterFusionCachesKernel() { // Constructs two functionally equivalent graphs const auto graph0_string = R"IR( graph(%0 : Float(2, 3, 4), diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 186aaaec2bba..8f43882c9e22 100644 --- 
a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -21,6 +21,7 @@ namespace jit { _(InternedStrings) \ _(PassManagement) \ _(Proto) \ + _(RegisterFusionCachesKernel) \ _(SchemaParser) \ _(TopologicalIndex) \ _(SubgraphUtils) \ @@ -52,6 +53,7 @@ namespace jit { _(ExtraFilesHookPreference) \ _(SaveExtraFilesHook) \ _(TypeTags) \ + _(DCE) \ _(CustomFusionNestedBlocks) \ _(ModuleInterfaceSerialization) \ _(ModuleCloneWithModuleInterface) \ @@ -91,10 +93,12 @@ namespace jit { _(MobileSaveLoadParameters) \ _(MobileSaveLoadParametersEmpty) \ _(LiteSGD) \ - _(LiteSequentialSampler) + _(LiteSequentialSampler) \ + _(FusionAliasing) #if defined(USE_CUDA) #define TH_FORALL_TESTS_CUDA(_) \ + _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ _(Interp) \ @@ -199,6 +203,7 @@ namespace jit { _(GPU_FusionThreadPredicate) #else #define TH_FORALL_TESTS_CUDA(_) \ + _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ _(Interp) \ From e9aa6898ab83988f6f3f5df351907e77e7cd38be Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Wed, 23 Sep 2020 17:40:15 -0700 Subject: [PATCH 070/449] Revert D23802296: gtest-ify JIT tests, through the letter c Test Plan: revert-hammer Differential Revision: D23802296 (https://github.com/pytorch/pytorch/commit/d2b045030eb60283b8aeeb2956c7ebe91628fece) Original commit changeset: 20c9798a414e fbshipit-source-id: a28d56039ca404fe94ed7572f1febd1673e3e788 --- test/cpp/jit/test_autodiff.cpp | 9 +- test/cpp/jit/test_class_import.cpp | 12 +- test/cpp/jit/test_class_parser.cpp | 4 +- test/cpp/jit/test_cleanup_passes.cpp | 37 +- test/cpp/jit/test_code_template.cpp | 50 +-- test/cpp/jit/test_constant_pooling.cpp | 87 +++-- .../jit/test_create_autodiff_subgraphs.cpp | 5 +- test/cpp/jit/test_custom_class.cpp | 4 +- test/cpp/jit/test_custom_operators.cpp | 342 +++++++++--------- test/cpp/jit/test_misc.cpp | 10 - test/cpp/jit/tests.h | 16 + 11 files changed, 294 insertions(+), 282 deletions(-) diff --git a/test/cpp/jit/test_autodiff.cpp b/test/cpp/jit/test_autodiff.cpp index 3993c63b1708..7d431776a971 100644 --- a/test/cpp/jit/test_autodiff.cpp +++ b/test/cpp/jit/test_autodiff.cpp @@ -1,5 +1,4 @@ -#include - +#include "test/cpp/jit/test_base.h" #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/frontend/tracer.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" @@ -84,7 +83,7 @@ variable_list grad( fmap(inputs, get_edge)); } -TEST(AutodiffTest, ADFormulas) { +void testADFormulas() { const auto cast = [](const Variable& v) { return static_cast(v); }; @@ -175,7 +174,7 @@ TEST(AutodiffTest, ADFormulas) { } } -TEST(AutodiffTest, Differentiate) { +void testDifferentiate() { // Note: can't use IRParser for this test due to issue #23989 auto graph = std::make_shared(); std::vector sizes{2, 3, 4}; @@ -230,7 +229,7 @@ TEST(AutodiffTest, Differentiate) { ->run(*grad_spec.df); } -TEST(AutodiffTest, DifferentiateWithRequiresGrad) { +void testDifferentiateWithRequiresGrad() { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): diff --git a/test/cpp/jit/test_class_import.cpp b/test/cpp/jit/test_class_import.cpp index ffa845b3e2a8..82bc0cf3bccc 100644 --- a/test/cpp/jit/test_class_import.cpp +++ b/test/cpp/jit/test_class_import.cpp @@ -1,7 +1,7 @@ -#include +#include +#include #include -#include #include #include #include @@ -45,7 +45,7 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } -TEST(ClassImportTest, Basic) { +void testClassImport() { auto cu1 = std::make_shared(); auto cu2 = std::make_shared(); std::vector constantTable; @@ -80,7 +80,7 @@ 
TEST(ClassImportTest, Basic) { ASSERT_FALSE(c); } -TEST(ClassImportTest, ScriptObject) { +void testScriptObject() { Module m1("m1"); Module m2("m2"); std::vector constantTable; @@ -114,7 +114,7 @@ def __init__(self, x): return x )JIT"; -TEST(ClassImportTest, ClassDerive) { +void testClassDerive() { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu); const auto self = SimpleSelf(cls); @@ -142,7 +142,7 @@ class FooBar1234(Module): return (self.f).top() )JIT"; -TEST(ClassImportTest, CustomClass) { +void testSaveLoadTorchbind() { auto cu1 = std::make_shared(); std::vector constantTable; // Import different versions of FooTest into two namespaces. diff --git a/test/cpp/jit/test_class_parser.cpp b/test/cpp/jit/test_class_parser.cpp index a5b19f63fd3f..45e37103bb5a 100644 --- a/test/cpp/jit/test_class_parser.cpp +++ b/test/cpp/jit/test_class_parser.cpp @@ -1,5 +1,3 @@ -#include - #include #include #include @@ -17,7 +15,7 @@ const auto testSource = R"JIT( an_attribute : Tensor )JIT"; -TEST(ClassParserTest, Basic) { +void testClassParser() { Parser p(std::make_shared(testSource)); std::vector definitions; std::vector resolvers; diff --git a/test/cpp/jit/test_cleanup_passes.cpp b/test/cpp/jit/test_cleanup_passes.cpp index 38ceef932eb0..2f2ca4e0a19b 100644 --- a/test/cpp/jit/test_cleanup_passes.cpp +++ b/test/cpp/jit/test_cleanup_passes.cpp @@ -1,19 +1,19 @@ -#include - #include #include #include #include +#include "test/cpp/jit/test_base.h" namespace torch { namespace jit { -TEST(CleanupPassTest, Basic) { +void testCleanUpPasses() { // Tests stability of clean up passes when dealing with constant pooling // and constant propagation. - auto graph = std::make_shared(); - parseIR( - R"IR( + { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond.1 : Tensor, %suffix.1 : str): %3 : bool = aten::Bool(%cond.1) # o.py:6:7 @@ -31,19 +31,20 @@ graph(%cond.1 : Tensor, -> (%12) return (%25) )IR", - &*graph); - runCleanupPasses(graph); - testing::FileCheck() - .check_count( - "prim::Constant[value=\"same string with a twist\"]", - 1, - /*exactly=*/true) - ->run(*graph); + &*graph); + runCleanupPasses(graph); + testing::FileCheck() + .check_count( + "prim::Constant[value=\"same string with a twist\"]", + 1, + /*exactly=*/true) + ->run(*graph); - auto graph_after_pass_once = graph->toString(); - runCleanupPasses(graph); - auto graph_after_pass_twice = graph->toString(); - ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); + auto graph_after_pass_once = graph->toString(); + runCleanupPasses(graph); + auto graph_after_pass_twice = graph->toString(); + ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); + } } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_code_template.cpp b/test/cpp/jit/test_code_template.cpp index 35897474f1f2..e4d7d1ef856e 100644 --- a/test/cpp/jit/test_code_template.cpp +++ b/test/cpp/jit/test_code_template.cpp @@ -1,6 +1,6 @@ -#include +#include "test/cpp/jit/test_base.h" +#include "test/cpp/jit/test_utils.h" -#include #include "torch/csrc/jit/frontend/code_template.h" namespace torch { @@ -33,29 +33,31 @@ static const auto ct_expect = R"( int notest(int a) )"; -TEST(TestCodeTemplate, Copying) { - TemplateEnv e; - e.s("hi", "foo"); - e.v("what", {"is", "this"}); - TemplateEnv c(e); - c.s("hi", "foo2"); - ASSERT_EQ(e.s("hi"), "foo"); - ASSERT_EQ(c.s("hi"), "foo2"); - ASSERT_EQ(e.v("what")[0], "is"); -} +void testCodeTemplate() { + { + TemplateEnv e; + e.s("hi", "foo"); + e.v("what", {"is", "this"}); + TemplateEnv c(e); + 
c.s("hi", "foo2"); + ASSERT_EQ(e.s("hi"), "foo"); + ASSERT_EQ(c.s("hi"), "foo2"); + ASSERT_EQ(e.v("what")[0], "is"); + } -TEST(TestCodeTemplate, Formatting) { - TemplateEnv e; - e.v("args", {"hi", "8"}); - e.v("bar", {"what\non many\nlines...", "7"}); - e.s("a", "3"); - e.s("b", "4"); - e.v("stuff", {"things...", "others"}); - e.v("empty", {}); - auto s = ct.format(e); - // std::cout << "'" << s << "'\n"; - // std::cout << "'" << ct_expect << "'\n"; - ASSERT_EQ(s, ct_expect); + { + TemplateEnv e; + e.v("args", {"hi", "8"}); + e.v("bar", {"what\non many\nlines...", "7"}); + e.s("a", "3"); + e.s("b", "4"); + e.v("stuff", {"things...", "others"}); + e.v("empty", {}); + auto s = ct.format(e); + // std::cout << "'" << s << "'\n"; + // std::cout << "'" << ct_expect << "'\n"; + ASSERT_EQ(s, ct_expect); + } } } // namespace jit diff --git a/test/cpp/jit/test_constant_pooling.cpp b/test/cpp/jit/test_constant_pooling.cpp index c8cb58e1886a..b949c9a45b25 100644 --- a/test/cpp/jit/test_constant_pooling.cpp +++ b/test/cpp/jit/test_constant_pooling.cpp @@ -1,10 +1,9 @@ -#include - #include #include #include #include #include +#include "test/cpp/jit/test_base.h" #include #include @@ -12,26 +11,26 @@ namespace torch { namespace jit { -TEST(ConstantPoolingTest, Int) { - auto graph = std::make_shared(); - parseIR( - R"IR( +void testConstantPooling() { + { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %8 : int = prim::Constant[value=1]() %10 : int = prim::Constant[value=1]() return (%8, %10) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant", 1, /*exactly*/ true) - ->run(*graph); -} - -TEST(ConstantPoolingTest, PoolingAcrossBlocks) { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant", 1, /*exactly*/ true) + ->run(*graph); + } + { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond : Tensor): %a : str = prim::Constant[value="bcd"]() %3 : bool = aten::Bool(%cond) @@ -45,18 +44,17 @@ graph(%cond : Tensor): %7 : (str, str) = prim::TupleConstruct(%a, %b) return (%7) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) - ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) - ->run(*graph); -} - -TEST(ConstantPoolingTest, PoolingDifferentDevices) { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) + ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) + ->run(*graph); + } + { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %2 : int = prim::Constant[value=2]() %1 : int = prim::Constant[value=1]() @@ -72,21 +70,22 @@ graph(): prim::Print(%x, %y, %z) return (%1) )IR", - &*graph); - // three tensors created - two different devices among the three - // don't have good support for parsing tensor constants - ConstantPropagation(graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count( - "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->check_count( - "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->run(*graph); + &*graph); + // three tensors created - two different devices among the three + // don't have good support for parsing tensor constants + ConstantPropagation(graph); + ConstantPooling(graph); + 
testing::FileCheck() + .check_count( + "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->check_count( + "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->run(*graph); + } } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_create_autodiff_subgraphs.cpp b/test/cpp/jit/test_create_autodiff_subgraphs.cpp index e97043f84d24..8da6d9d6a1b2 100644 --- a/test/cpp/jit/test_create_autodiff_subgraphs.cpp +++ b/test/cpp/jit/test_create_autodiff_subgraphs.cpp @@ -1,5 +1,4 @@ -#include - +#include "test/cpp/jit/test_base.h" #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" @@ -7,7 +6,7 @@ namespace torch { namespace jit { -TEST(CreateAutodiffSubgraphsTest, Basic) { +void testCreateAutodiffSubgraphs() { auto graph = build_lstm(); CreateAutodiffSubgraphs(graph, /*threshold=*/2); // all of the ops are within the DifferentiableGraph diff --git a/test/cpp/jit/test_custom_class.cpp b/test/cpp/jit/test_custom_class.cpp index 25c518d3142c..543fbc20eb3d 100644 --- a/test/cpp/jit/test_custom_class.cpp +++ b/test/cpp/jit/test_custom_class.cpp @@ -1,5 +1,3 @@ -#include - #include #include @@ -320,7 +318,7 @@ TORCH_LIBRARY(_TorchScriptTesting, m) { } // namespace -TEST(CustomClassTest, TorchbindIValueAPI) { +void testTorchbindIValueAPI() { script::Module m("m"); // test make_custom_class API diff --git a/test/cpp/jit/test_custom_operators.cpp b/test/cpp/jit/test_custom_operators.cpp index d3f61268e8f1..529b36385bd4 100644 --- a/test/cpp/jit/test_custom_operators.cpp +++ b/test/cpp/jit/test_custom_operators.cpp @@ -1,5 +1,4 @@ -#include - +#include "test/cpp/jit/test_base.h" #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/ir/alias_analysis.h" @@ -12,135 +11,134 @@ namespace torch { namespace jit { -TEST(CustomOperatorTest, InferredSchema) { - torch::RegisterOperators reg( - "foo::bar", [](double a, at::Tensor b) { return a + b; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar"); - - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); +void testCustomOperators() { + { + torch::RegisterOperators reg( + "foo::bar", [](double a, at::Tensor b) { return a + b; }); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar"); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); -} + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); -TEST(CustomOperatorTest, ExplicitSchema) { - torch::RegisterOperators reg( - "foo::bar_with_schema(float a, Tensor b) -> 
Tensor", - [](double a, at::Tensor b) { return a + b; }); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); - auto& ops = - getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); + } + { + torch::RegisterOperators reg( + "foo::bar_with_schema(float a, Tensor b) -> Tensor", + [](double a, at::Tensor b) { return a + b; }); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + auto& ops = + getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); -} + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); -TEST(CustomOperatorTest, ListParameters) { - // Check that lists work well. 
- torch::RegisterOperators reg( - "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", - [](torch::List ints, - torch::List floats, - torch::List tensors) { return floats; }); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists"); - - ASSERT_EQ(op->schema().arguments().size(), 3); - ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); - ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); - ASSERT_TRUE( - op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); - ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); - - Stack stack; - push(stack, c10::List({1, 2})); - push(stack, c10::List({1.0, 2.0})); - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); - - ASSERT_EQ(output.size(), 2); - ASSERT_EQ(output.get(0), 1.0); - ASSERT_EQ(output.get(1), 2.0); -} + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); -TEST(CustomOperatorTest, ListParameters2) { - torch::RegisterOperators reg( - "foo::lists2(Tensor[] tensors) -> Tensor[]", - [](torch::List tensors) { return tensors; }); + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); + } + { + // Check that lists work well. + torch::RegisterOperators reg( + "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", + [](torch::List ints, + torch::List floats, + torch::List tensors) { return floats; }); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists"); + + ASSERT_EQ(op->schema().arguments().size(), 3); + ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); + ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); + ASSERT_TRUE( + op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); + ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); + + Stack stack; + push(stack, c10::List({1, 2})); + push(stack, c10::List({1.0, 2.0})); + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 2); + ASSERT_EQ(output.get(0), 1.0); + ASSERT_EQ(output.get(1), 2.0); + } + { + torch::RegisterOperators reg( + "foo::lists2(Tensor[] tensors) -> Tensor[]", + [](torch::List tensors) { return tensors; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); - ASSERT_EQ(ops.size(), 1); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); + ASSERT_EQ(ops.size(), 1); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists2"); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists2"); - ASSERT_EQ(op->schema().arguments().size(), 1); - 
ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); + ASSERT_EQ(op->schema().arguments().size(), 1); + ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); - Stack stack; - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); + Stack stack; + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); - ASSERT_EQ(output.size(), 1); - ASSERT_TRUE(output.get(0).allclose(at::ones(5))); + ASSERT_EQ(output.size(), 1); + ASSERT_TRUE(output.get(0).allclose(at::ones(5))); + } } -TEST(CustomOperatorTest, Aliasing) { +void testCustomOperatorAliasing() { torch::RegisterOperators reg( "foo::aliasing", [](at::Tensor a, at::Tensor b) -> at::Tensor { a.add_(b); @@ -184,65 +182,77 @@ graph(%x: Tensor, %y: Tensor): } } -static constexpr char op_list[] = "foofoo::bar.template;foo::another"; +void testIValueKWargs() { + const auto text = R"( + def foo(a : int, b : int, c : int = 4): + return a + 2*b + 3*c + )"; + auto cu = compile(text); + auto result = cu->get_function("foo")({1}, {{"b", 3}}); + ASSERT_EQ(result.toInt(), 19); +} + +void testTemplatedOperatorCreator() { + constexpr char op_list[] = "foofoo::bar.template;foo::another"; #define TORCH_SELECTIVE_NAME_IN_SCHEMA(l, n) \ torch::detail::SelectiveStr(n) -TEST(TestCustomOperator, OperatorGeneratorUndeclared) { - // Try to register an op name that does not exist in op_list. - // Expected: the op name is not registered. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); - ASSERT_EQ(ops.size(), 0); -} + { + // Try to register an op name that does not exist in op_list. + // Expected: the op name is not registered. + torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); + ASSERT_EQ(ops.size(), 0); + } -TEST(TestCustomOperator, OperatorGeneratorBasic) { - // The operator should be successfully registered since its name is in the - // whitelist. 
- torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foofoo::bar"); - - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); - - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); + { + // The operator should be successfully registered since its name is in the + // whitelist. + torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foofoo::bar"); + + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); + } } } // namespace jit diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 92baba1168da..953d1bf42fc0 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -2225,15 +2225,5 @@ void testProfilerDisableInCallback() { t.join(); } -void testIValueKWargs() { - const auto text = R"( - def foo(a : int, b : int, c : int = 4): - return a + 2*b + 3*c - )"; - auto cu = compile(text); - auto result = cu->get_function("foo")({1}, {{"b", 3}}); - ASSERT_EQ(result.toInt(), 19); -} - } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 8f43882c9e22..45d7f48b1f8a 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -9,14 +9,22 @@ namespace torch { namespace jit { #define TH_FORALL_TESTS(_) \ + _(ADFormulas) \ _(Attributes) \ _(Blocks) \ _(CallStack) \ _(CallStackCaching) \ + _(CodeTemplate) \ _(ControlFlow) \ + _(CreateAutodiffSubgraphs) \ + _(CustomOperators) \ + _(CustomOperatorAliasing) \ + _(TemplatedOperatorCreator) \ _(IValueKWargs) \ _(CustomFusion) \ _(SchemaMatching) \ + _(Differentiate) \ + _(DifferentiateWithRequiresGrad) \ _(FromQualString) \ _(InternedStrings) \ _(PassManagement) \ @@ -27,9 +35,12 @@ namespace jit { _(SubgraphUtils) \ _(SubgraphUtilsVmap) \ _(IRParser) \ 
+ _(ConstantPooling) \ + _(CleanUpPasses) \ _(THNNConv) \ _(ATenNativeBatchNorm) \ _(NoneSchemaMatch) \ + _(ClassParser) \ _(UnifyTypes) \ _(Profiler) \ _(FallbackGraphs) \ @@ -50,11 +61,15 @@ namespace jit { _(ModuleDeepcopyAliasing) \ _(ModuleDefine) \ _(QualifiedName) \ + _(ClassImport) \ + _(ScriptObject) \ _(ExtraFilesHookPreference) \ _(SaveExtraFilesHook) \ _(TypeTags) \ _(DCE) \ _(CustomFusionNestedBlocks) \ + _(ClassDerive) \ + _(SaveLoadTorchbind) \ _(ModuleInterfaceSerialization) \ _(ModuleCloneWithModuleInterface) \ _(ClassTypeAddRemoveAttr) \ @@ -85,6 +100,7 @@ namespace jit { _(LiteInterpreterHierarchyModuleInfo) \ _(LiteInterpreterDuplicatedClassTypeModuleInfo) \ _(LiteInterpreterEval) \ + _(TorchbindIValueAPI) \ _(LiteInterpreterDict) \ _(LiteInterpreterFindAndRunMethod) \ _(LiteInterpreterFindWrongMethodName) \ From 27c7158166089db7329b9f0dea65da36e3785cda Mon Sep 17 00:00:00 2001 From: Bugra Akyildiz Date: Wed, 23 Sep 2020 17:55:24 -0700 Subject: [PATCH 071/449] Remove __future__ imports for legacy Python2 support (#45033) Summary: The `2to3` tool can be run with its `future` fixer to remove these imports automatically; the `caffe2` directory has the most redundant imports: ```2to3 -f future -w caffe2``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/45033 Reviewed By: seemethere Differential Revision: D23808648 Pulled By: bugra fbshipit-source-id: 38971900f0fe43ab44a9168e57f2307580d36a38 --- caffe2/contrib/aten/aten_test.py | 8 ++++---- .../contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py | 8 ++++---- caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py | 8 ++++---- caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py | 2 +- caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py | 8 ++++---- caffe2/contrib/fakelowp/test/test_fusions.py | 2 +- caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py | 2 +- caffe2/contrib/fakelowp/test/test_int8_quant.py | 2 +- caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py | 8 ++++---- caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py | 8 ++++---- caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py | 8 ++++---- caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py | 2 +- caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py | 2 +- caffe2/contrib/gloo/gloo_test.py | 8 ++++---- caffe2/contrib/nccl/nccl_ops_test.py | 8 ++++---- caffe2/contrib/nnpack/nnpack_ops_test.py | 8 ++++---- caffe2/contrib/playground/AnyExp.py | 8 ++++---- caffe2/contrib/playground/AnyExpOnTerm.py | 8 ++++---- caffe2/contrib/playground/ModuleRegister.py | 8 ++++---- caffe2/contrib/playground/checkpoint.py | 8 ++++---- caffe2/contrib/playground/compute_loss.py | 8 ++++---- caffe2/contrib/playground/compute_topk_accuracy.py | 8 ++++---- caffe2/contrib/playground/meter.py | 8 ++++---- caffe2/contrib/playground/module_map.py | 8 ++++---- caffe2/contrib/playground/output_generator.py | 8 ++++---- caffe2/contrib/playground/resnetdemo/IN1k_resnet.py | 8 ++++---- .../playground/resnetdemo/IN1k_resnet_no_test_model.py | 8 ++++---- .../resnetdemo/caffe2_resnet50_default_forward.py | 8 ++++---- .../resnetdemo/caffe2_resnet50_default_param_update.py | 8 ++++---- .../playground/resnetdemo/explicit_resnet_forward.py | 8 ++++---- .../playground/resnetdemo/explicit_resnet_param_update.py | 8 ++++---- caffe2/contrib/playground/resnetdemo/gfs_IN1k.py | 8 ++++---- .../resnetdemo/override_no_test_model_no_checkpoint.py | 8 ++++---- .../contrib/playground/resnetdemo/rendezvous_filestore.py | 8 ++++---- caffe2/contrib/prof/cuda_profile_ops_test.py
| 8 ++++---- caffe2/contrib/tensorboard/tensorboard.py | 8 ++++---- caffe2/contrib/tensorboard/tensorboard_exporter.py | 8 ++++---- caffe2/contrib/tensorboard/tensorboard_exporter_test.py | 8 ++++---- caffe2/contrib/tensorboard/tensorboard_test.py | 8 ++++---- caffe2/contrib/warpctc/ctc_ops_test.py | 6 +++--- caffe2/core/nomnigraph/op_gen.py | 8 ++++---- caffe2/distributed/file_store_handler_op_test.py | 8 ++++---- caffe2/distributed/redis_store_handler_op_test.py | 8 ++++---- caffe2/distributed/store_ops_test_util.py | 8 ++++---- caffe2/experiments/python/SparseTransformer.py | 8 ++++---- caffe2/experiments/python/convnet_benchmarks.py | 8 ++++---- caffe2/experiments/python/device_reduce_sum_bench.py | 8 ++++---- caffe2/experiments/python/funhash_op_test.py | 8 ++++---- caffe2/experiments/python/net_construct_bench.py | 8 ++++---- caffe2/experiments/python/sparse_funhash_op_test.py | 8 ++++---- caffe2/experiments/python/sparse_reshape_op_test.py | 8 ++++---- caffe2/experiments/python/tt_contraction_op_test.py | 8 ++++---- caffe2/experiments/python/tt_pad_op_test.py | 8 ++++---- caffe2/perfkernels/hp_emblookup_codegen.py | 2 +- caffe2/python/__init__.py | 2 +- caffe2/python/allcompare_test.py | 8 ++++---- caffe2/python/attention.py | 8 ++++---- caffe2/python/benchmark_generator.py | 8 ++++---- .../benchmarks/fused_rowwise_nbit_conversion_bench.py | 2 +- .../benchmarks/sparse_lengths_sum_nbit_benchmark.py | 2 +- caffe2/python/binarysize.py | 8 ++++---- caffe2/python/brew.py | 8 ++++---- caffe2/python/brew_test.py | 8 ++++---- caffe2/python/build.py | 8 ++++---- caffe2/python/cached_reader.py | 8 ++++---- caffe2/python/checkpoint.py | 8 ++++---- caffe2/python/checkpoint_test.py | 8 ++++---- caffe2/python/cnn.py | 8 ++++---- caffe2/python/context.py | 8 ++++---- caffe2/python/context_test.py | 8 ++++---- caffe2/python/control.py | 8 ++++---- caffe2/python/control_ops_grad.py | 8 ++++---- caffe2/python/control_ops_grad_test.py | 8 ++++---- caffe2/python/control_ops_util.py | 8 ++++---- caffe2/python/control_test.py | 8 ++++---- caffe2/python/convert.py | 8 ++++---- caffe2/python/convert_test.py | 8 ++++---- caffe2/python/core.py | 8 ++++---- caffe2/python/core_gradients_test.py | 8 ++++---- caffe2/python/core_test.py | 8 ++++---- caffe2/python/crf.py | 2 +- caffe2/python/crf_predict.py | 2 +- caffe2/python/crf_viterbi_test.py | 8 ++++---- caffe2/python/data_parallel_model.py | 6 +++--- caffe2/python/data_parallel_model_test.py | 6 +++--- caffe2/python/data_workers.py | 8 ++++---- caffe2/python/data_workers_test.py | 8 ++++---- caffe2/python/dataio.py | 8 ++++---- caffe2/python/dataio_test.py | 8 ++++---- caffe2/python/dataset.py | 8 ++++---- caffe2/python/db_file_reader.py | 8 ++++---- caffe2/python/db_test.py | 8 ++++---- caffe2/python/docs/formatter.py | 8 ++++---- caffe2/python/docs/generator.py | 8 ++++---- caffe2/python/docs/github.py | 8 ++++---- caffe2/python/docs/parser.py | 8 ++++---- caffe2/python/dyndep.py | 8 ++++---- caffe2/python/embedding_generation_benchmark.py | 8 ++++---- caffe2/python/examples/char_rnn.py | 8 ++++---- caffe2/python/examples/lmdb_create_example.py | 8 ++++---- caffe2/python/experiment_util.py | 8 ++++---- caffe2/python/extension_loader.py | 8 ++++---- caffe2/python/fakefp16_transform_lib.py | 6 +++--- caffe2/python/fakelowp/init_shared_libs.py | 2 +- caffe2/python/fakelowp/test_utils.py | 8 ++++---- caffe2/python/filler_test.py | 6 +++--- caffe2/python/functional.py | 8 ++++---- caffe2/python/functional_test.py | 8 ++++---- 
caffe2/python/fused_8bit_rowwise_conversion_ops_test.py | 8 ++++---- caffe2/python/gradient_check_test.py | 8 ++++---- caffe2/python/gradient_checker.py | 8 ++++---- caffe2/python/gru_cell.py | 8 ++++---- caffe2/python/helpers/algebra.py | 8 ++++---- caffe2/python/helpers/arg_scope.py | 6 +++--- caffe2/python/helpers/array_helpers.py | 8 ++++---- caffe2/python/helpers/control_ops.py | 8 ++++---- caffe2/python/helpers/conv.py | 8 ++++---- caffe2/python/helpers/db_input.py | 8 ++++---- caffe2/python/helpers/dropout.py | 8 ++++---- caffe2/python/helpers/elementwise_linear.py | 8 ++++---- caffe2/python/helpers/fc.py | 8 ++++---- caffe2/python/helpers/nonlinearity.py | 8 ++++---- caffe2/python/helpers/normalization.py | 8 ++++---- caffe2/python/helpers/pooling.py | 8 ++++---- caffe2/python/helpers/tools.py | 8 ++++---- caffe2/python/helpers/train.py | 8 ++++---- caffe2/python/hip_test_util.py | 8 ++++---- caffe2/python/hsm_util.py | 8 ++++---- caffe2/python/hypothesis_test.py | 6 +++--- caffe2/python/hypothesis_test_util.py | 8 ++++---- caffe2/python/ideep/LRN_op_test.py | 8 ++++---- caffe2/python/ideep/adam_op_test.py | 8 ++++---- caffe2/python/ideep/blobs_queue_db_test.py | 8 ++++---- caffe2/python/ideep/channel_shuffle_op_test.py | 8 ++++---- caffe2/python/ideep/concat_split_op_test.py | 8 ++++---- caffe2/python/ideep/conv_op_test.py | 8 ++++---- caffe2/python/ideep/conv_transpose_test.py | 6 +++--- caffe2/python/ideep/convfusion_op_test.py | 8 ++++---- caffe2/python/ideep/copy_op_test.py | 8 ++++---- caffe2/python/ideep/dropout_op_test.py | 8 ++++---- caffe2/python/ideep/elementwise_sum_op_test.py | 8 ++++---- caffe2/python/ideep/expanddims_squeeze_op_test.py | 8 ++++---- caffe2/python/ideep/fc_op_test.py | 8 ++++---- caffe2/python/ideep/leaky_relu_op_test.py | 8 ++++---- caffe2/python/ideep/moment_sgd_op_test.py | 8 ++++---- caffe2/python/ideep/operator_fallback_op_test.py | 8 ++++---- caffe2/python/ideep/order_switch_op_test.py | 8 ++++---- caffe2/python/ideep/pool_op_test.py | 8 ++++---- caffe2/python/ideep/pre_convert_test.py | 8 ++++---- caffe2/python/ideep/relu_op_test.py | 8 ++++---- caffe2/python/ideep/reshape_op_test.py | 8 ++++---- caffe2/python/ideep/shape_op_test.py | 8 ++++---- caffe2/python/ideep/sigmoid_op_test.py | 8 ++++---- caffe2/python/ideep/softmax_op_test.py | 8 ++++---- caffe2/python/ideep/spatial_bn_op_test.py | 8 ++++---- caffe2/python/ideep/test_ideep_net.py | 8 ++++---- caffe2/python/ideep/transform_ideep_net.py | 8 ++++---- caffe2/python/ideep/transpose_op_test.py | 8 ++++---- caffe2/python/ideep/weightedsum_op_test.py | 8 ++++---- caffe2/python/ideep_test_util.py | 8 ++++---- caffe2/python/layer_model_helper.py | 8 ++++---- caffe2/python/layer_model_instantiator.py | 8 ++++---- caffe2/python/layer_parameter_sharing_test.py | 8 ++++---- caffe2/python/layer_test_util.py | 8 ++++---- caffe2/python/layers/__init__.py | 8 ++++---- caffe2/python/layers/adaptive_weight.py | 2 +- caffe2/python/layers/add_bias.py | 8 ++++---- caffe2/python/layers/arc_cosine_feature_map.py | 8 ++++---- caffe2/python/layers/batch_huber_loss.py | 8 ++++---- caffe2/python/layers/batch_lr_loss.py | 8 ++++---- caffe2/python/layers/batch_mse_loss.py | 8 ++++---- caffe2/python/layers/batch_normalization.py | 8 ++++---- caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py | 8 ++++---- caffe2/python/layers/batch_softmax_loss.py | 8 ++++---- caffe2/python/layers/blob_weighted_sum.py | 8 ++++---- caffe2/python/layers/bpr_loss.py | 8 ++++---- caffe2/python/layers/bucket_weighted.py | 8 ++++---- 
caffe2/python/layers/build_index.py | 8 ++++---- caffe2/python/layers/concat.py | 8 ++++---- caffe2/python/layers/constant_weight.py | 8 ++++---- caffe2/python/layers/conv.py | 8 ++++---- caffe2/python/layers/dropout.py | 8 ++++---- caffe2/python/layers/fc.py | 8 ++++---- caffe2/python/layers/fc_with_bootstrap.py | 2 +- caffe2/python/layers/fc_without_bias.py | 8 ++++---- caffe2/python/layers/feature_sparse_to_dense.py | 2 +- caffe2/python/layers/functional.py | 8 ++++---- caffe2/python/layers/gather_record.py | 8 ++++---- caffe2/python/layers/homotopy_weight.py | 8 ++++---- caffe2/python/layers/label_smooth.py | 8 ++++---- caffe2/python/layers/last_n_window_collector.py | 8 ++++---- caffe2/python/layers/layer_normalization.py | 8 ++++---- caffe2/python/layers/layers.py | 2 +- caffe2/python/layers/margin_rank_loss.py | 8 ++++---- caffe2/python/layers/merge_id_lists.py | 8 ++++---- caffe2/python/layers/pairwise_similarity.py | 8 ++++---- caffe2/python/layers/position_weighted.py | 8 ++++---- caffe2/python/layers/random_fourier_features.py | 8 ++++---- caffe2/python/layers/reservoir_sampling.py | 8 ++++---- caffe2/python/layers/sampling_train.py | 8 ++++---- caffe2/python/layers/sampling_trainable_mixin.py | 8 ++++---- caffe2/python/layers/select_record_by_context.py | 8 ++++---- caffe2/python/layers/semi_random_features.py | 8 ++++---- caffe2/python/layers/sparse_dropout_with_replacement.py | 8 ++++---- caffe2/python/layers/sparse_feature_hash.py | 8 ++++---- caffe2/python/layers/sparse_lookup.py | 8 ++++---- caffe2/python/layers/split.py | 8 ++++---- caffe2/python/layers/tags.py | 8 ++++---- caffe2/python/layers/uniform_sampling.py | 8 ++++---- caffe2/python/layers_test.py | 8 ++++---- caffe2/python/lazy_dyndep.py | 8 ++++---- caffe2/python/lazy_dyndep_test.py | 8 ++++---- .../python/lengths_reducer_fused_8bit_rowwise_ops_test.py | 2 +- caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py | 8 ++++---- caffe2/python/lstm_benchmark.py | 8 ++++---- caffe2/python/memonger.py | 8 ++++---- caffe2/python/memonger_test.py | 8 ++++---- caffe2/python/mkl/mkl_LRN_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_LRN_speed_test.py | 8 ++++---- caffe2/python/mkl/mkl_concat_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_conv_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_copy_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_elementwise_add_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_elementwise_sum_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_fc_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_fc_speed_test.py | 8 ++++---- caffe2/python/mkl/mkl_fill_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_pool_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_pool_speed_test.py | 8 ++++---- caffe2/python/mkl/mkl_relu_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_sbn_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_sbn_speed_test.py | 8 ++++---- caffe2/python/mkl/mkl_sigmoid_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_speed_test.py | 8 ++++---- caffe2/python/mkl/mkl_squeeze_op_test.py | 8 ++++---- caffe2/python/mkl/rewrite_graph.py | 8 ++++---- caffe2/python/mkl/rewrite_graph_test.py | 8 ++++---- caffe2/python/mkl_test_util.py | 8 ++++---- caffe2/python/model_helper.py | 8 ++++---- caffe2/python/model_helper_test.py | 2 +- caffe2/python/modeling/compute_histogram_for_blobs.py | 8 ++++---- .../python/modeling/compute_histogram_for_blobs_test.py | 8 ++++---- caffe2/python/modeling/compute_norm_for_blobs.py | 8 ++++---- caffe2/python/modeling/compute_norm_for_blobs_test.py | 8 ++++---- 
caffe2/python/modeling/compute_statistics_for_blobs.py | 8 ++++---- .../python/modeling/compute_statistics_for_blobs_test.py | 8 ++++---- caffe2/python/modeling/get_entry_from_blobs.py | 8 ++++---- caffe2/python/modeling/get_entry_from_blobs_test.py | 8 ++++---- caffe2/python/modeling/gradient_clipping.py | 8 ++++---- caffe2/python/modeling/gradient_clipping_test.py | 8 ++++---- caffe2/python/modeling/initializers.py | 8 ++++---- caffe2/python/modeling/initializers_test.py | 8 ++++---- caffe2/python/modeling/net_modifier.py | 8 ++++---- caffe2/python/modeling/parameter_info.py | 8 ++++---- caffe2/python/modeling/parameter_sharing.py | 8 ++++---- caffe2/python/modeling/parameter_sharing_test.py | 8 ++++---- caffe2/python/models/__sym_init__.py | 8 ++++---- caffe2/python/models/download.py | 8 ++++---- caffe2/python/models/imagenet_trainer_test_utils.py | 8 ++++---- caffe2/python/models/resnet.py | 6 +++--- caffe2/python/models/resnet_test.py | 8 ++++---- caffe2/python/models/seq2seq/beam_search.py | 8 ++++---- caffe2/python/models/seq2seq/seq2seq_beam_search_test.py | 8 ++++---- caffe2/python/models/seq2seq/seq2seq_model_helper.py | 8 ++++---- caffe2/python/models/seq2seq/seq2seq_model_helper_test.py | 8 ++++---- caffe2/python/models/seq2seq/seq2seq_util.py | 8 ++++---- caffe2/python/models/seq2seq/train.py | 8 ++++---- caffe2/python/models/seq2seq/translate.py | 8 ++++---- caffe2/python/models/shufflenet.py | 8 ++++---- caffe2/python/models/shufflenet_test.py | 8 ++++---- caffe2/python/modifier_context.py | 8 ++++---- caffe2/python/net_builder.py | 8 ++++---- caffe2/python/net_builder_test.py | 8 ++++---- caffe2/python/net_drawer.py | 8 ++++---- caffe2/python/net_printer.py | 8 ++++---- caffe2/python/net_printer_test.py | 8 ++++---- caffe2/python/nomnigraph.py | 2 +- caffe2/python/nomnigraph_test.py | 8 ++++---- caffe2/python/nomnigraph_transformations.py | 2 +- caffe2/python/nomnigraph_transformations_test.py | 8 ++++---- caffe2/python/normalizer.py | 2 +- caffe2/python/normalizer_context.py | 8 ++++---- caffe2/python/normalizer_test.py | 6 +++--- caffe2/python/numa_benchmark.py | 6 +++--- caffe2/python/numa_test.py | 6 +++--- caffe2/python/observer_test.py | 8 ++++---- caffe2/python/onnx/backend.py | 8 ++++---- caffe2/python/onnx/backend_cpp_rep.py | 8 ++++---- caffe2/python/onnx/backend_rep.py | 8 ++++---- caffe2/python/onnx/bin/conversion.py | 6 +++--- caffe2/python/onnx/error.py | 8 ++++---- caffe2/python/onnx/frontend.py | 8 ++++---- caffe2/python/onnx/helper.py | 8 ++++---- caffe2/python/onnx/onnxifi.py | 8 ++++---- caffe2/python/onnx/test_onnxifi.py | 8 ++++---- caffe2/python/onnx/tests/__init__.py | 8 ++++---- caffe2/python/onnx/tests/c2_ref_test.py | 8 ++++---- caffe2/python/onnx/tests/conversion_test.py | 6 +++--- caffe2/python/onnx/tests/helper_test.py | 8 ++++---- caffe2/python/onnx/tests/onnx_backend_test.py | 8 ++++---- caffe2/python/onnx/tests/ssa_test.py | 8 ++++---- caffe2/python/onnx/tests/test_utils.py | 8 ++++---- caffe2/python/onnx/workspace.py | 8 ++++---- caffe2/python/operator_fp_exceptions_test.py | 6 +++--- caffe2/python/operator_test/activation_ops_test.py | 8 ++++---- caffe2/python/operator_test/adadelta_test.py | 8 ++++---- caffe2/python/operator_test/adagrad_test.py | 2 +- caffe2/python/operator_test/adagrad_test_helper.py | 2 +- caffe2/python/operator_test/adam_test.py | 8 ++++---- caffe2/python/operator_test/affine_channel_op_test.py | 6 +++--- caffe2/python/operator_test/apmeter_test.py | 8 ++++---- caffe2/python/operator_test/arg_ops_test.py | 8 ++++---- 
caffe2/python/operator_test/assert_test.py | 6 +++--- caffe2/python/operator_test/atomic_ops_test.py | 8 ++++---- caffe2/python/operator_test/basic_rnn_test.py | 8 ++++---- caffe2/python/operator_test/batch_box_cox_test.py | 8 ++++---- caffe2/python/operator_test/batch_bucketize_op_test.py | 8 ++++---- caffe2/python/operator_test/batch_moments_op_test.py | 6 +++--- .../python/operator_test/batch_sparse_to_dense_op_test.py | 8 ++++---- caffe2/python/operator_test/bbox_transform_test.py | 8 ++++---- caffe2/python/operator_test/bisect_percentile_op_test.py | 8 ++++---- caffe2/python/operator_test/blobs_queue_db_test.py | 8 ++++---- caffe2/python/operator_test/boolean_mask_test.py | 6 +++--- caffe2/python/operator_test/boolean_unmask_test.py | 8 ++++---- caffe2/python/operator_test/box_with_nms_limit_op_test.py | 8 ++++---- caffe2/python/operator_test/bucketize_op_test.py | 8 ++++---- caffe2/python/operator_test/cast_op_test.py | 8 ++++---- caffe2/python/operator_test/ceil_op_test.py | 8 ++++---- .../operator_test/channel_backprop_stats_op_test.py | 8 ++++---- caffe2/python/operator_test/channel_shuffle_test.py | 2 +- caffe2/python/operator_test/channel_stats_op_test.py | 6 +++--- caffe2/python/operator_test/checkpoint_test.py | 8 ++++---- caffe2/python/operator_test/clip_op_test.py | 8 ++++---- caffe2/python/operator_test/clip_tensor_op_test.py | 8 ++++---- .../collect_and_distribute_fpn_rpn_proposals_op_test.py | 8 ++++---- caffe2/python/operator_test/concat_split_op_test.py | 8 ++++---- caffe2/python/operator_test/conditional_test.py | 6 +++--- caffe2/python/operator_test/conftest.py | 8 ++++---- caffe2/python/operator_test/conv_test.py | 2 +- caffe2/python/operator_test/conv_transpose_test.py | 6 +++--- caffe2/python/operator_test/copy_ops_test.py | 8 ++++---- .../python/operator_test/copy_rows_to_tensor_op_test.py | 2 +- .../operator_test/cosine_embedding_criterion_op_test.py | 8 ++++---- caffe2/python/operator_test/counter_ops_test.py | 8 ++++---- caffe2/python/operator_test/crf_test.py | 8 ++++---- caffe2/python/operator_test/cross_entropy_ops_test.py | 8 ++++---- .../operator_test/ctc_beam_search_decoder_op_test.py | 8 ++++---- caffe2/python/operator_test/ctc_greedy_decoder_op_test.py | 8 ++++---- caffe2/python/operator_test/cudnn_recurrent_test.py | 8 ++++---- caffe2/python/operator_test/data_couple_op_test.py | 8 ++++---- caffe2/python/operator_test/dataset_ops_test.py | 8 ++++---- caffe2/python/operator_test/deform_conv_test.py | 2 +- .../operator_test/dense_vector_to_id_list_op_test.py | 8 ++++---- caffe2/python/operator_test/depthwise_3x3_conv_test.py | 8 ++++---- caffe2/python/operator_test/detectron_keypoints.py | 8 ++++---- caffe2/python/operator_test/distance_op_test.py | 8 ++++---- caffe2/python/operator_test/dropout_op_test.py | 8 ++++---- caffe2/python/operator_test/duplicate_operands_test.py | 8 ++++---- caffe2/python/operator_test/elementwise_linear_op_test.py | 8 ++++---- .../python/operator_test/elementwise_logical_ops_test.py | 8 ++++---- .../python/operator_test/elementwise_op_broadcast_test.py | 8 ++++---- caffe2/python/operator_test/elementwise_ops_test.py | 8 ++++---- caffe2/python/operator_test/emptysample_ops_test.py | 8 ++++---- caffe2/python/operator_test/enforce_finite_op_test.py | 8 ++++---- caffe2/python/operator_test/ensure_clipped_test.py | 2 +- caffe2/python/operator_test/ensure_cpu_output_op_test.py | 8 ++++---- caffe2/python/operator_test/erf_op_test.py | 8 ++++---- caffe2/python/operator_test/expand_op_test.py | 8 ++++---- 
caffe2/python/operator_test/fc_operator_test.py | 8 ++++---- caffe2/python/operator_test/feature_maps_ops_test.py | 8 ++++---- caffe2/python/operator_test/filler_ops_test.py | 8 ++++---- caffe2/python/operator_test/find_op_test.py | 8 ++++---- caffe2/python/operator_test/flatten_op_test.py | 8 ++++---- caffe2/python/operator_test/flexible_top_k_test.py | 8 ++++---- caffe2/python/operator_test/floor_op_test.py | 8 ++++---- .../fused_nbit_rowwise_conversion_ops_test.py | 2 +- .../operator_test/fused_nbit_rowwise_test_helper.py | 2 +- caffe2/python/operator_test/gather_ops_test.py | 8 ++++---- caffe2/python/operator_test/gather_ranges_op_test.py | 2 +- .../given_tensor_byte_string_to_uint8_fill_op_test.py | 8 ++++---- caffe2/python/operator_test/given_tensor_fill_op_test.py | 8 ++++---- caffe2/python/operator_test/glu_op_test.py | 8 ++++---- caffe2/python/operator_test/group_conv_test.py | 6 +++--- caffe2/python/operator_test/group_norm_op_test.py | 6 +++--- caffe2/python/operator_test/gru_test.py | 8 ++++---- .../python/operator_test/heatmap_max_keypoint_op_test.py | 8 ++++---- caffe2/python/operator_test/hsm_test.py | 8 ++++---- caffe2/python/operator_test/hyperbolic_ops_test.py | 8 ++++---- caffe2/python/operator_test/im2col_col2im_test.py | 8 ++++---- caffe2/python/operator_test/image_input_op_test.py | 8 ++++---- caffe2/python/operator_test/index_hash_ops_test.py | 8 ++++---- caffe2/python/operator_test/index_ops_test.py | 8 ++++---- caffe2/python/operator_test/instance_norm_test.py | 6 +++--- caffe2/python/operator_test/integral_image_ops_test.py | 8 ++++---- caffe2/python/operator_test/jsd_ops_test.py | 8 ++++---- caffe2/python/operator_test/key_split_ops_test.py | 8 ++++---- caffe2/python/operator_test/lars_test.py | 8 ++++---- caffe2/python/operator_test/layer_norm_op_test.py | 8 ++++---- caffe2/python/operator_test/leaky_relu_test.py | 6 +++--- .../operator_test/learning_rate_adaption_op_test.py | 8 ++++---- caffe2/python/operator_test/learning_rate_op_test.py | 8 ++++---- caffe2/python/operator_test/length_split_op_test.py | 8 ++++---- caffe2/python/operator_test/lengths_pad_op_test.py | 8 ++++---- .../lengths_reducer_fused_nbit_rowwise_ops_test.py | 2 +- caffe2/python/operator_test/lengths_tile_op_test.py | 8 ++++---- caffe2/python/operator_test/lengths_top_k_ops_test.py | 8 ++++---- caffe2/python/operator_test/listwise_l2r_operator_test.py | 2 +- caffe2/python/operator_test/load_save_test.py | 8 ++++---- caffe2/python/operator_test/locally_connected_op_test.py | 6 +++--- caffe2/python/operator_test/loss_ops_test.py | 8 ++++---- caffe2/python/operator_test/lpnorm_op_test.py | 8 ++++---- caffe2/python/operator_test/map_ops_test.py | 8 ++++---- .../operator_test/margin_ranking_criterion_op_test.py | 8 ++++---- caffe2/python/operator_test/math_ops_test.py | 8 ++++---- caffe2/python/operator_test/matmul_op_test.py | 8 ++++---- caffe2/python/operator_test/mean_op_test.py | 8 ++++---- caffe2/python/operator_test/merge_id_lists_op_test.py | 8 ++++---- caffe2/python/operator_test/mkl_conv_op_test.py | 8 ++++---- caffe2/python/operator_test/mkl_packed_fc_op_test.py | 8 ++++---- caffe2/python/operator_test/mod_op_test.py | 8 ++++---- caffe2/python/operator_test/moments_op_test.py | 8 ++++---- caffe2/python/operator_test/momentum_sgd_test.py | 8 ++++---- caffe2/python/operator_test/mpi_test.py | 8 ++++---- caffe2/python/operator_test/mul_gradient_benchmark.py | 8 ++++---- caffe2/python/operator_test/negate_gradient_op_test.py | 8 ++++---- caffe2/python/operator_test/ngram_ops_test.py | 8 
++++---- caffe2/python/operator_test/normalize_op_test.py | 6 +++--- caffe2/python/operator_test/numpy_tile_op_test.py | 8 ++++---- caffe2/python/operator_test/one_hot_ops_test.py | 8 ++++---- caffe2/python/operator_test/onnx_while_test.py | 6 +++--- caffe2/python/operator_test/order_switch_test.py | 2 +- caffe2/python/operator_test/pack_ops_test.py | 8 ++++---- caffe2/python/operator_test/pack_rnn_sequence_op_test.py | 8 ++++---- caffe2/python/operator_test/pad_test.py | 6 +++--- caffe2/python/operator_test/partition_ops_test.py | 8 ++++---- caffe2/python/operator_test/percentile_op_test.py | 8 ++++---- .../operator_test/piecewise_linear_transform_test.py | 8 ++++---- caffe2/python/operator_test/pooling_test.py | 6 +++--- caffe2/python/operator_test/prepend_dim_test.py | 8 ++++---- caffe2/python/operator_test/python_op_test.py | 8 ++++---- caffe2/python/operator_test/quantile_test.py | 2 +- .../operator_test/rand_quantization_op_speed_test.py | 2 +- caffe2/python/operator_test/rand_quantization_op_test.py | 8 ++++---- caffe2/python/operator_test/rank_loss_operator_test.py | 8 ++++---- caffe2/python/operator_test/rebatching_queue_test.py | 8 ++++---- caffe2/python/operator_test/record_queue_test.py | 8 ++++---- .../python/operator_test/recurrent_net_executor_test.py | 8 ++++---- caffe2/python/operator_test/recurrent_network_test.py | 8 ++++---- caffe2/python/operator_test/reduce_ops_test.py | 8 ++++---- caffe2/python/operator_test/reduction_ops_test.py | 8 ++++---- caffe2/python/operator_test/reshape_ops_test.py | 8 ++++---- caffe2/python/operator_test/resize_op_test.py | 6 +++--- caffe2/python/operator_test/rmac_regions_op_test.py | 8 ++++---- caffe2/python/operator_test/rms_norm_op_test.py | 2 +- caffe2/python/operator_test/rnn_cell_test.py | 8 ++++---- caffe2/python/operator_test/roi_align_rotated_op_test.py | 8 ++++---- caffe2/python/operator_test/rowwise_counter_test.py | 2 +- caffe2/python/operator_test/scale_op_test.py | 8 ++++---- caffe2/python/operator_test/segment_ops_test.py | 8 ++++---- caffe2/python/operator_test/selu_op_test.py | 8 ++++---- caffe2/python/operator_test/sequence_ops_test.py | 8 ++++---- caffe2/python/operator_test/shape_inference_test.py | 8 ++++---- .../operator_test/sinusoid_position_encoding_op_test.py | 8 ++++---- caffe2/python/operator_test/softmax_ops_test.py | 8 ++++---- caffe2/python/operator_test/softplus_op_test.py | 8 ++++---- .../sparse_dropout_with_replacement_op_test.py | 8 ++++---- .../python/operator_test/sparse_gradient_checker_test.py | 8 ++++---- .../python/operator_test/sparse_lengths_sum_benchmark.py | 2 +- caffe2/python/operator_test/sparse_lp_regularizer_test.py | 8 ++++---- caffe2/python/operator_test/sparse_normalize_test.py | 8 ++++---- caffe2/python/operator_test/sparse_ops_test.py | 8 ++++---- .../python/operator_test/sparse_to_dense_mask_op_test.py | 8 ++++---- caffe2/python/operator_test/spatial_bn_op_test.py | 8 ++++---- .../python/operator_test/specialized_segment_ops_test.py | 2 +- caffe2/python/operator_test/square_root_divide_op_test.py | 8 ++++---- caffe2/python/operator_test/stats_ops_test.py | 8 ++++---- caffe2/python/operator_test/stats_put_ops_test.py | 8 ++++---- caffe2/python/operator_test/storm_test.py | 8 ++++---- caffe2/python/operator_test/string_ops_test.py | 8 ++++---- caffe2/python/operator_test/text_file_reader_test.py | 8 ++++---- caffe2/python/operator_test/thresholded_relu_op_test.py | 8 ++++---- caffe2/python/operator_test/tile_op_test.py | 8 ++++---- caffe2/python/operator_test/top_k_test.py | 8 ++++---- 
caffe2/python/operator_test/torch_integration_test.py | 2 +- caffe2/python/operator_test/transpose_op_test.py | 6 +++--- caffe2/python/operator_test/trigonometric_op_test.py | 8 ++++---- caffe2/python/operator_test/unique_ops_test.py | 8 ++++---- .../python/operator_test/unique_uniform_fill_op_test.py | 8 ++++---- caffe2/python/operator_test/upsample_op_test.py | 6 +++--- caffe2/python/operator_test/utility_ops_test.py | 8 ++++---- caffe2/python/operator_test/video_input_op_test.py | 2 +- caffe2/python/operator_test/weight_scale_test.py | 8 ++++---- caffe2/python/operator_test/weighted_multi_sample_test.py | 8 ++++---- caffe2/python/operator_test/weighted_sample_test.py | 8 ++++---- caffe2/python/operator_test/weighted_sum_test.py | 8 ++++---- caffe2/python/operator_test/wngrad_test.py | 8 ++++---- caffe2/python/optimizer.py | 2 +- caffe2/python/optimizer_context.py | 8 ++++---- caffe2/python/optimizer_test.py | 6 +++--- caffe2/python/optimizer_test_util.py | 8 ++++---- caffe2/python/parallel_workers.py | 8 ++++---- caffe2/python/parallel_workers_test.py | 8 ++++---- caffe2/python/parallelize_bmuf_distributed_test.py | 6 +++--- caffe2/python/pipeline.py | 8 ++++---- caffe2/python/pipeline_test.py | 8 ++++---- caffe2/python/predictor/mobile_exporter.py | 8 ++++---- caffe2/python/predictor/mobile_exporter_test.py | 8 ++++---- caffe2/python/predictor/predictor_exporter.py | 8 ++++---- caffe2/python/predictor/predictor_exporter_test.py | 8 ++++---- caffe2/python/predictor/predictor_py_utils.py | 8 ++++---- caffe2/python/predictor/predictor_test.py | 8 ++++---- caffe2/python/predictor/serde.py | 8 ++++---- caffe2/python/predictor_constants.py | 8 ++++---- caffe2/python/python_op_test.py | 8 ++++---- caffe2/python/queue_util.py | 8 ++++---- caffe2/python/record_queue.py | 8 ++++---- caffe2/python/recurrent.py | 8 ++++---- caffe2/python/regularizer.py | 2 +- caffe2/python/regularizer_context.py | 8 ++++---- caffe2/python/regularizer_test.py | 2 +- caffe2/python/rnn/__init__.py | 8 ++++---- caffe2/python/rnn/lstm_comparison.py | 8 ++++---- caffe2/python/rnn/rnn_cell_test_util.py | 8 ++++---- caffe2/python/rnn_cell.py | 8 ++++---- caffe2/python/schema.py | 8 ++++---- caffe2/python/schema_test.py | 8 ++++---- caffe2/python/scope.py | 8 ++++---- caffe2/python/scope_test.py | 8 ++++---- caffe2/python/serialized_test/coverage.py | 8 ++++---- caffe2/python/serialized_test/serialized_test_util.py | 8 ++++---- caffe2/python/session.py | 8 ++++---- caffe2/python/session_test.py | 8 ++++---- caffe2/python/sparse_to_dense_mask_test.py | 8 ++++---- caffe2/python/sparse_to_dense_test.py | 8 ++++---- caffe2/python/task.py | 8 ++++---- caffe2/python/task_test.py | 8 ++++---- caffe2/python/test/blob_deallocation_test.py | 6 +++--- caffe2/python/test/do_op_test.py | 6 +++--- caffe2/python/test/executor_test.py | 6 +++--- caffe2/python/test/executor_test_util.py | 6 +++--- caffe2/python/test/fakefp16_transform_test.py | 6 +++--- caffe2/python/test/gpu_context_test.py | 8 ++++---- caffe2/python/test/python_protobuf_test.py | 6 +++--- caffe2/python/test_util.py | 8 ++++---- caffe2/python/text_file_reader.py | 8 ++++---- caffe2/python/timeout_guard.py | 8 ++++---- caffe2/python/transformations.py | 8 ++++---- caffe2/python/transformations_test.py | 8 ++++---- caffe2/python/trt/test_trt.py | 8 ++++---- caffe2/python/trt/transform.py | 8 ++++---- caffe2/python/tt_core.py | 6 +++--- caffe2/python/tt_core_test.py | 8 ++++---- caffe2/python/utils.py | 8 ++++---- caffe2/python/utils_test.py | 8 ++++---- 
caffe2/python/workspace.py | 8 ++++---- caffe2/python/workspace_test.py | 8 ++++---- .../quantization/server/batch_matmul_dnnlowp_op_test.py | 2 +- .../server/batch_permutation_dnnlowp_op_test.py | 2 +- .../server/channel_shuffle_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/concat_dnnlowp_op_test.py | 2 +- .../quantization/server/conv_depthwise_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py | 2 +- caffe2/quantization/server/conv_dnnlowp_op_test.py | 2 +- .../server/conv_groupwise_dnnlowp_acc16_op_test.py | 2 +- .../quantization/server/conv_groupwise_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/dequantize_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/dnnlowp_test_utils.py | 2 +- .../server/elementwise_add_dnnlowp_op_test.py | 2 +- .../server/elementwise_linear_dnnlowp_op_test.py | 2 +- .../server/elementwise_mul_dnnlowp_op_test.py | 2 +- .../server/elementwise_sum_dnnlowp_op_test.py | 2 +- .../server/fully_connected_dnnlowp_acc16_op_test.py | 2 +- .../server/fully_connected_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/fully_connected_fp16_test.py | 2 +- .../server/fully_connected_rowwise_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/gather_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/group_norm_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/int8_gen_quant_params_test.py | 2 +- .../server/int8_quant_scheme_blob_fill_test.py | 2 +- caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/observer_test.py | 2 +- caffe2/quantization/server/pool_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/quantize_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/relu_dnnlowp_op_test.py | 2 +- .../server/resize_nearest_3d_dnnlowp_op_test.py | 2 +- .../quantization/server/resize_nearest_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/sigmoid_dnnlowp_op_test.py | 2 +- .../server/spatial_batch_norm_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/tanh_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/utils.py | 2 +- scripts/get_python_cmake_flags.py | 6 +++--- setup.py | 2 +- tools/amd_build/build_amd.py | 2 +- tools/autograd/gen_variable_type.py | 2 +- tools/clang_tidy.py | 2 +- tools/pyi/gen_pyi.py | 2 +- tools/setup_helpers/cmake.py | 2 +- 597 files changed, 2086 insertions(+), 2086 deletions(-) diff --git a/caffe2/contrib/aten/aten_test.py b/caffe2/contrib/aten/aten_test.py index 92448fe355de..d9d99a1c1ae9 100644 --- a/caffe2/contrib/aten/aten_test.py +++ b/caffe2/contrib/aten/aten_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, dyndep from hypothesis import given diff --git a/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py index a8979ca63aa6..94a76fed85f5 100644 --- a/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py index 1a4f57b6aa05..7b1b5f070171 100644 --- a/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py +++ 
b/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py b/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py index 511c29884288..b7a9fc810cfc 100644 --- a/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py +++ b/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py index bb013a26a609..7a68af63a84b 100644 --- a/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_fusions.py b/caffe2/contrib/fakelowp/test/test_fusions.py index 22e78b0756c0..45757badba43 100644 --- a/caffe2/contrib/fakelowp/test/test_fusions.py +++ b/caffe2/contrib/fakelowp/test/test_fusions.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + # Must happen before importing caffe2.python.* import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py b/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py index 4c82917f042c..5a91a00706ff 100644 --- a/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py +++ b/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.fakelowp.init_shared_libs # noqa import numpy as np diff --git a/caffe2/contrib/fakelowp/test/test_int8_quant.py b/caffe2/contrib/fakelowp/test/test_int8_quant.py index 83d0cc176def..02095286e1ee 100644 --- a/caffe2/contrib/fakelowp/test/test_int8_quant.py +++ b/caffe2/contrib/fakelowp/test/test_int8_quant.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + # Must happen before importing caffe2.python.* import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py index 698b839f3785..9ff0986116b6 100644 --- a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py index 58161409fa80..e8512b4dcd74 100644 --- a/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import 
unicode_literals + + + + import numpy as np diff --git a/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py index 0ca76bd86ba9..a8d6640fa58e 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py index da7eae2708f3..f8fd03cbfb73 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import unittest diff --git a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py index ad26952a901c..207403f1bd0d 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py +++ b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import unittest diff --git a/caffe2/contrib/gloo/gloo_test.py b/caffe2/contrib/gloo/gloo_test.py index 8eaff9e137ae..fbca9b8fe64c 100644 --- a/caffe2/contrib/gloo/gloo_test.py +++ b/caffe2/contrib/gloo/gloo_test.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/contrib/nccl/nccl_ops_test.py b/caffe2/contrib/nccl/nccl_ops_test.py index 3f4685548281..2d4e9b518b9b 100644 --- a/caffe2/contrib/nccl/nccl_ops_test.py +++ b/caffe2/contrib/nccl/nccl_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/contrib/nnpack/nnpack_ops_test.py b/caffe2/contrib/nnpack/nnpack_ops_test.py index b12acd151a71..4bedf0e0ecd6 100644 --- a/caffe2/contrib/nnpack/nnpack_ops_test.py +++ b/caffe2/contrib/nnpack/nnpack_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/contrib/playground/AnyExp.py b/caffe2/contrib/playground/AnyExp.py index 5d968b0455fc..b8e2f8b37b2a 100644 --- a/caffe2/contrib/playground/AnyExp.py +++ b/caffe2/contrib/playground/AnyExp.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from abc import abstractmethod diff --git a/caffe2/contrib/playground/AnyExpOnTerm.py b/caffe2/contrib/playground/AnyExpOnTerm.py index b269777da675..dcfe61f14545 100644 --- a/caffe2/contrib/playground/AnyExpOnTerm.py +++ b/caffe2/contrib/playground/AnyExpOnTerm.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import 
unicode_literals + + + + import argparse import json diff --git a/caffe2/contrib/playground/ModuleRegister.py b/caffe2/contrib/playground/ModuleRegister.py index 89a9deb8989e..27e0c07f6384 100644 --- a/caffe2/contrib/playground/ModuleRegister.py +++ b/caffe2/contrib/playground/ModuleRegister.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import inspect import logging diff --git a/caffe2/contrib/playground/checkpoint.py b/caffe2/contrib/playground/checkpoint.py index 9887a408cc01..5ea3d2a9035c 100644 --- a/caffe2/contrib/playground/checkpoint.py +++ b/caffe2/contrib/playground/checkpoint.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import pickle diff --git a/caffe2/contrib/playground/compute_loss.py b/caffe2/contrib/playground/compute_loss.py index 53eb77d77701..2965ff3895ac 100644 --- a/caffe2/contrib/playground/compute_loss.py +++ b/caffe2/contrib/playground/compute_loss.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.contrib.playground.meter as Meter from caffe2.python import workspace diff --git a/caffe2/contrib/playground/compute_topk_accuracy.py b/caffe2/contrib/playground/compute_topk_accuracy.py index 396b797ed1b6..e2f148231c6d 100644 --- a/caffe2/contrib/playground/compute_topk_accuracy.py +++ b/caffe2/contrib/playground/compute_topk_accuracy.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.contrib.playground.meter as Meter from caffe2.python import workspace diff --git a/caffe2/contrib/playground/meter.py b/caffe2/contrib/playground/meter.py index 7e109e445d04..ed0158bbf087 100644 --- a/caffe2/contrib/playground/meter.py +++ b/caffe2/contrib/playground/meter.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from abc import abstractmethod diff --git a/caffe2/contrib/playground/module_map.py b/caffe2/contrib/playground/module_map.py index 0f5de5943a36..8eb1a3a00cdc 100644 --- a/caffe2/contrib/playground/module_map.py +++ b/caffe2/contrib/playground/module_map.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + # Input import caffe2.contrib.playground.resnetdemo.\ diff --git a/caffe2/contrib/playground/output_generator.py b/caffe2/contrib/playground/output_generator.py index 41d8e3fdfae4..aaa977c08faa 100644 --- a/caffe2/contrib/playground/output_generator.py +++ b/caffe2/contrib/playground/output_generator.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import timeout_guard diff --git a/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py b/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py index 52ce95ed5dab..58085dbc3721 100644 --- a/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py +++ 
b/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py b/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py index cf893b598446..480070752e63 100644 --- a/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py +++ b/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py index 174ffe1e034a..fa0fedd84a8c 100644 --- a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py +++ b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python.models.resnet as resnet diff --git a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py index 974653446a22..5697d1301b8a 100644 --- a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py +++ b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def gen_param_update_builder_fun(self, model, dataset, is_train): diff --git a/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py b/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py index 01b51fa8450c..056ddd8c9ea0 100644 --- a/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py +++ b/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging logging.basicConfig() diff --git a/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py b/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py index 8a86289778ee..5378acd61886 100644 --- a/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py +++ b/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core from caffe2.proto import caffe2_pb2 diff --git a/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py b/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py index 8b2647114b63..496ac22ffde5 100644 --- a/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py +++ b/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + # # 
example1 using gfs as input source. diff --git a/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py b/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py index 4cc2d68cbfd7..419d6a25e95b 100644 --- a/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py +++ b/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def checkpoint(self, epoch): self.model_path = None diff --git a/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py b/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py index d757896793ff..0a56d68257ee 100644 --- a/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py +++ b/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python import dyndep diff --git a/caffe2/contrib/prof/cuda_profile_ops_test.py b/caffe2/contrib/prof/cuda_profile_ops_test.py index 2953503bbea5..c77b7ae88ba6 100644 --- a/caffe2/contrib/prof/cuda_profile_ops_test.py +++ b/caffe2/contrib/prof/cuda_profile_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.proto import caffe2_pb2 diff --git a/caffe2/contrib/tensorboard/tensorboard.py b/caffe2/contrib/tensorboard/tensorboard.py index 9aece77bc09a..6f5ad1896e35 100644 --- a/caffe2/contrib/tensorboard/tensorboard.py +++ b/caffe2/contrib/tensorboard/tensorboard.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import click import collections diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter.py b/caffe2/contrib/tensorboard/tensorboard_exporter.py index a3c0e0e59723..ef12ce563cde 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from builtins import bytes import copy diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter_test.py b/caffe2/contrib/tensorboard/tensorboard_exporter_test.py index 6b9c894e16fb..31ef8180fb57 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter_test.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/contrib/tensorboard/tensorboard_test.py b/caffe2/contrib/tensorboard/tensorboard_test.py index 494cb6fc7d12..8751be14ead5 100644 --- a/caffe2/contrib/tensorboard/tensorboard_test.py +++ b/caffe2/contrib/tensorboard/tensorboard_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import click.testing import 
numpy as np diff --git a/caffe2/contrib/warpctc/ctc_ops_test.py b/caffe2/contrib/warpctc/ctc_ops_test.py index 3b21c8b66747..013e80a98773 100644 --- a/caffe2/contrib/warpctc/ctc_ops_test.py +++ b/caffe2/contrib/warpctc/ctc_ops_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from caffe2.proto import caffe2_pb2 diff --git a/caffe2/core/nomnigraph/op_gen.py b/caffe2/core/nomnigraph/op_gen.py index 49cd2abb2cef..fbe1c8da377e 100755 --- a/caffe2/core/nomnigraph/op_gen.py +++ b/caffe2/core/nomnigraph/op_gen.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse from textwrap import dedent diff --git a/caffe2/distributed/file_store_handler_op_test.py b/caffe2/distributed/file_store_handler_op_test.py index 2e90c548d50f..427b68420d39 100644 --- a/caffe2/distributed/file_store_handler_op_test.py +++ b/caffe2/distributed/file_store_handler_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import errno import os diff --git a/caffe2/distributed/redis_store_handler_op_test.py b/caffe2/distributed/redis_store_handler_op_test.py index 3df69bf2701a..8f5d58e85185 100644 --- a/caffe2/distributed/redis_store_handler_op_test.py +++ b/caffe2/distributed/redis_store_handler_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import uuid diff --git a/caffe2/distributed/store_ops_test_util.py b/caffe2/distributed/store_ops_test_util.py index 2abe697cface..05245be9b210 100644 --- a/caffe2/distributed/store_ops_test_util.py +++ b/caffe2/distributed/store_ops_test_util.py @@ -1,9 +1,9 @@ ## @package store_ops_test_util # Module caffe2.distributed.store_ops_test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from multiprocessing import Process, Queue diff --git a/caffe2/experiments/python/SparseTransformer.py b/caffe2/experiments/python/SparseTransformer.py index ff9ab7715c33..d97f076a7bb3 100644 --- a/caffe2/experiments/python/SparseTransformer.py +++ b/caffe2/experiments/python/SparseTransformer.py @@ -15,10 +15,10 @@ ## @package SparseTransformer # Module caffe2.experiments.python.SparseTransformer -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace import scipy.sparse diff --git a/caffe2/experiments/python/convnet_benchmarks.py b/caffe2/experiments/python/convnet_benchmarks.py index 386c9c4b7ebc..ff9b7a20bc73 100644 --- a/caffe2/experiments/python/convnet_benchmarks.py +++ b/caffe2/experiments/python/convnet_benchmarks.py @@ -15,10 +15,10 @@ ## @package convnet_benchmarks # Module caffe2.experiments.python.convnet_benchmarks -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + """ Benchmark for common convnets. 
diff --git a/caffe2/experiments/python/device_reduce_sum_bench.py b/caffe2/experiments/python/device_reduce_sum_bench.py index dbe0dae4f0c2..1a795e2fcf0e 100644 --- a/caffe2/experiments/python/device_reduce_sum_bench.py +++ b/caffe2/experiments/python/device_reduce_sum_bench.py @@ -15,10 +15,10 @@ ## @package device_reduce_sum_bench # Module caffe2.experiments.python.device_reduce_sum_bench -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import itertools diff --git a/caffe2/experiments/python/funhash_op_test.py b/caffe2/experiments/python/funhash_op_test.py index 6a4eb0e6b5b5..3fc4c8bf54fd 100644 --- a/caffe2/experiments/python/funhash_op_test.py +++ b/caffe2/experiments/python/funhash_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/experiments/python/net_construct_bench.py b/caffe2/experiments/python/net_construct_bench.py index b7cf605c0c04..ec12517c03be 100644 --- a/caffe2/experiments/python/net_construct_bench.py +++ b/caffe2/experiments/python/net_construct_bench.py @@ -15,10 +15,10 @@ ## @package net_construct_bench # Module caffe2.experiments.python.net_construct_bench -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import logging diff --git a/caffe2/experiments/python/sparse_funhash_op_test.py b/caffe2/experiments/python/sparse_funhash_op_test.py index 2af006249c7d..cfc7a0bb6165 100644 --- a/caffe2/experiments/python/sparse_funhash_op_test.py +++ b/caffe2/experiments/python/sparse_funhash_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/experiments/python/sparse_reshape_op_test.py b/caffe2/experiments/python/sparse_reshape_op_test.py index 5849580f09e1..a22bf561ce86 100644 --- a/caffe2/experiments/python/sparse_reshape_op_test.py +++ b/caffe2/experiments/python/sparse_reshape_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/experiments/python/tt_contraction_op_test.py b/caffe2/experiments/python/tt_contraction_op_test.py index 4cd04a16ea23..1e41e9ed8ddd 100644 --- a/caffe2/experiments/python/tt_contraction_op_test.py +++ b/caffe2/experiments/python/tt_contraction_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/experiments/python/tt_pad_op_test.py b/caffe2/experiments/python/tt_pad_op_test.py index 10be7adcb453..27d13543348b 100644 --- a/caffe2/experiments/python/tt_pad_op_test.py +++ b/caffe2/experiments/python/tt_pad_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/perfkernels/hp_emblookup_codegen.py b/caffe2/perfkernels/hp_emblookup_codegen.py index f79b7c8e7d9c..75b0c8b583be 100644 --- a/caffe2/perfkernels/hp_emblookup_codegen.py +++ b/caffe2/perfkernels/hp_emblookup_codegen.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse import sys diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py index 09b5652e61f2..8582eff9ce19 100644 --- a/caffe2/python/__init__.py +++ b/caffe2/python/__init__.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + from caffe2.proto import caffe2_pb2 import os import sys diff --git a/caffe2/python/allcompare_test.py b/caffe2/python/allcompare_test.py index 663cc9e02864..22038715f289 100644 --- a/caffe2/python/allcompare_test.py +++ b/caffe2/python/allcompare_test.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/attention.py b/caffe2/python/attention.py index 73be94feaf2b..59f4a5adb6a5 100644 --- a/caffe2/python/attention.py +++ b/caffe2/python/attention.py @@ -1,9 +1,9 @@ ## @package attention # Module caffe2.python.attention -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew diff --git a/caffe2/python/benchmark_generator.py b/caffe2/python/benchmark_generator.py index 8393ca7875aa..84d0d46490b0 100644 --- a/caffe2/python/benchmark_generator.py +++ b/caffe2/python/benchmark_generator.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import string import argparse diff --git a/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py b/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py index 9b9a196e9770..ce96dbc1dd63 100644 --- a/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py +++ b/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse diff --git a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py index bdba35545255..1b683be0d51e 100644 --- a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py 
+++ b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse import datetime diff --git a/caffe2/python/binarysize.py b/caffe2/python/binarysize.py index 802d61025e30..39dba40df8a0 100644 --- a/caffe2/python/binarysize.py +++ b/caffe2/python/binarysize.py @@ -15,10 +15,10 @@ green, assuming that you have a xterm connection that supports color. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import subprocess import sys diff --git a/caffe2/python/brew.py b/caffe2/python/brew.py index 2722c21d84d0..0e050ec32c44 100644 --- a/caffe2/python/brew.py +++ b/caffe2/python/brew.py @@ -1,9 +1,9 @@ ## @package model_helper_api # Module caffe2.python.model_helper_api -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import sys import copy diff --git a/caffe2/python/brew_test.py b/caffe2/python/brew_test.py index 8b3d08977c2c..4973876a8008 100644 --- a/caffe2/python/brew_test.py +++ b/caffe2/python/brew_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, core, scope, workspace from caffe2.python.modeling.parameter_info import ParameterTags diff --git a/caffe2/python/build.py b/caffe2/python/build.py index 0f447265d5f4..862c031004c5 100644 --- a/caffe2/python/build.py +++ b/caffe2/python/build.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python._import_c_extension as C diff --git a/caffe2/python/cached_reader.py b/caffe2/python/cached_reader.py index 1dd179c71caf..980c4fe40e08 100644 --- a/caffe2/python/cached_reader.py +++ b/caffe2/python/cached_reader.py @@ -1,9 +1,9 @@ ## @package cached_reader # Module caffe2.python.cached_reader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os diff --git a/caffe2/python/checkpoint.py b/caffe2/python/checkpoint.py index cdd96eb1f492..9d7797fc3ada 100644 --- a/caffe2/python/checkpoint.py +++ b/caffe2/python/checkpoint.py @@ -1,9 +1,9 @@ ## @package checkpoint # Module caffe2.python.checkpoint -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import logging diff --git a/caffe2/python/checkpoint_test.py b/caffe2/python/checkpoint_test.py index a91bbf9910e2..90746747dd98 100644 --- a/caffe2/python/checkpoint_test.py +++ b/caffe2/python/checkpoint_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.schema import Struct, ConstRecord from caffe2.python import core, workspace, model_helper diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py index aead1d599474..a0fd52e1fdbc 100644 --- a/caffe2/python/cnn.py +++ b/caffe2/python/cnn.py @@ -1,9 +1,9 @@ ## @package cnn # Module caffe2.python.cnn -from __future__ 
import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, workspace from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/context.py b/caffe2/python/context.py index 928807ba2805..28815bb7f36b 100644 --- a/caffe2/python/context.py +++ b/caffe2/python/context.py @@ -1,9 +1,9 @@ ## @package context # Module caffe2.python.context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import threading import six diff --git a/caffe2/python/context_test.py b/caffe2/python/context_test.py index 6a1f77f5ecf8..6c259d326a19 100644 --- a/caffe2/python/context_test.py +++ b/caffe2/python/context_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context, test_util from threading import Thread diff --git a/caffe2/python/control.py b/caffe2/python/control.py index dd332f745f9a..6b0654d6f26e 100644 --- a/caffe2/python/control.py +++ b/caffe2/python/control.py @@ -11,10 +11,10 @@ If """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from future.utils import viewitems diff --git a/caffe2/python/control_ops_grad.py b/caffe2/python/control_ops_grad.py index 5a8d24cf55d8..a0e85f4d0bc1 100644 --- a/caffe2/python/control_ops_grad.py +++ b/caffe2/python/control_ops_grad.py @@ -1,9 +1,9 @@ ## @package control_ops_grad # Module caffe2.python.control_ops_grad -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/control_ops_grad_test.py b/caffe2/python/control_ops_grad_test.py index a84b9ca0a168..f637e38a5e33 100644 --- a/caffe2/python/control_ops_grad_test.py +++ b/caffe2/python/control_ops_grad_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import core, test_util, workspace diff --git a/caffe2/python/control_ops_util.py b/caffe2/python/control_ops_util.py index 76ab14a7bc65..cfff82de318b 100644 --- a/caffe2/python/control_ops_util.py +++ b/caffe2/python/control_ops_util.py @@ -1,9 +1,9 @@ ## @package control_ops_util # Module caffe2.python.control_ops_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core diff --git a/caffe2/python/control_test.py b/caffe2/python/control_test.py index e51aeffa8b04..3f9df172d2b7 100644 --- a/caffe2/python/control_test.py +++ b/caffe2/python/control_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import control, core, test_util, workspace diff --git a/caffe2/python/convert.py b/caffe2/python/convert.py index 44f81d6e2d13..18033661a69e 100644 --- a/caffe2/python/convert.py +++ b/caffe2/python/convert.py @@ -1,9 
+1,9 @@ ## @package workspace # Module caffe2.python.workspace -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2, torch_pb2 diff --git a/caffe2/python/convert_test.py b/caffe2/python/convert_test.py index 82c969c901ea..a1dc52aad2d9 100644 --- a/caffe2/python/convert_test.py +++ b/caffe2/python/convert_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import convert, workspace from caffe2.proto import caffe2_pb2, torch_pb2 diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 3b493277a182..6d7c503e2c81 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -1,9 +1,9 @@ ## @package core # Module caffe2.python.core -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from collections import namedtuple, OrderedDict, defaultdict from past.builtins import basestring diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index 8b229029f5f7..3674b7aa4585 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from future.utils import bytes_to_native_str from hypothesis import given, settings diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 8660f5cc2106..b0f5b11f0d1c 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from inspect import currentframe, getframeinfo import unittest diff --git a/caffe2/python/crf.py b/caffe2/python/crf.py index a009f8f0fa31..703ae604c654 100644 --- a/caffe2/python/crf.py +++ b/caffe2/python/crf.py @@ -1,6 +1,6 @@ ## @package crf # Module caffe2.python.crf -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python import brew, core, model_helper, recurrent diff --git a/caffe2/python/crf_predict.py b/caffe2/python/crf_predict.py index dd1c8720bfb1..9bc0372c50c0 100644 --- a/caffe2/python/crf_predict.py +++ b/caffe2/python/crf_predict.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python.crf import CRFWithLoss diff --git a/caffe2/python/crf_viterbi_test.py b/caffe2/python/crf_viterbi_test.py index 970a7c6d4a8f..052bbbf4e6bf 100644 --- a/caffe2/python/crf_viterbi_test.py +++ b/caffe2/python/crf_viterbi_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, crf from caffe2.python.cnn import CNNModelHelper diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 7f5527472cc2..95abb7159d42 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -1,8 +1,8 @@ ## @package data_parallel_model # Module 
caffe2.python.data_parallel_model -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from collections import OrderedDict from future.utils import viewitems, viewkeys, viewvalues diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py index e106dee97039..a0dbb3037c2c 100644 --- a/caffe2/python/data_parallel_model_test.py +++ b/caffe2/python/data_parallel_model_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from future.utils import viewkeys from multiprocessing import Process, Queue diff --git a/caffe2/python/data_workers.py b/caffe2/python/data_workers.py index eb49da78c0af..698a8953ef13 100644 --- a/caffe2/python/data_workers.py +++ b/caffe2/python/data_workers.py @@ -1,9 +1,9 @@ ## @package data_workers # Module caffe2.python.data_workers -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + ''' diff --git a/caffe2/python/data_workers_test.py b/caffe2/python/data_workers_test.py index 1abd8dfa28d7..4669aaf59476 100644 --- a/caffe2/python/data_workers_test.py +++ b/caffe2/python/data_workers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/dataio.py b/caffe2/python/dataio.py index 5221262582ee..ff6e9c6860f6 100644 --- a/caffe2/python/dataio.py +++ b/caffe2/python/dataio.py @@ -15,10 +15,10 @@ See `dataset.py` for an example of implementation. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.schema import Field, Struct, from_blob_list diff --git a/caffe2/python/dataio_test.py b/caffe2/python/dataio_test.py index 26f1c0902f71..0c45fb50aed9 100644 --- a/caffe2/python/dataio_test.py +++ b/caffe2/python/dataio_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.dataio import ( CompositeReader, diff --git a/caffe2/python/dataset.py b/caffe2/python/dataset.py index 387dbbaead58..4c2d4c806476 100644 --- a/caffe2/python/dataset.py +++ b/caffe2/python/dataset.py @@ -10,10 +10,10 @@ is stored as a set of native Caffe2 tensors, thus no type conversion or deserialization is necessary. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.dataio import Reader, Writer diff --git a/caffe2/python/db_file_reader.py b/caffe2/python/db_file_reader.py index 9296f1c6b7db..265b19251717 100644 --- a/caffe2/python/db_file_reader.py +++ b/caffe2/python/db_file_reader.py @@ -1,9 +1,9 @@ ## @package db_file_reader # Module caffe2.python.db_file_reader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope, workspace, _import_c_extension as C from caffe2.python.dataio import Reader diff --git a/caffe2/python/db_test.py b/caffe2/python/db_test.py index f642202b36f0..f0f5d2770dc0 100644 --- a/caffe2/python/db_test.py +++ b/caffe2/python/db_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace diff --git a/caffe2/python/docs/formatter.py b/caffe2/python/docs/formatter.py index 0a16420f6d5a..904f1731e960 100644 --- a/caffe2/python/docs/formatter.py +++ b/caffe2/python/docs/formatter.py @@ -1,9 +1,9 @@ ## @package formatter # Module caffe2.python.docs.formatter -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.docs.parser import Parser diff --git a/caffe2/python/docs/generator.py b/caffe2/python/docs/generator.py index 1bc41b7d1ccb..c5a7df369bc2 100644 --- a/caffe2/python/docs/generator.py +++ b/caffe2/python/docs/generator.py @@ -1,9 +1,9 @@ ## @package generator # Module caffe2.python.docs.generator -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import os from caffe2.python import core, workspace diff --git a/caffe2/python/docs/github.py b/caffe2/python/docs/github.py index 5cb1fdcf5d7b..3fd78507346e 100644 --- a/caffe2/python/docs/github.py +++ b/caffe2/python/docs/github.py @@ -1,9 +1,9 @@ ## @package github # Module caffe2.python.docs.github -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import os from caffe2.python.docs.formatter import Markdown diff --git a/caffe2/python/docs/parser.py b/caffe2/python/docs/parser.py index 024989c97e25..a4edb6e07246 100644 --- a/caffe2/python/docs/parser.py +++ b/caffe2/python/docs/parser.py @@ -1,9 +1,9 @@ ## @package parser # Module caffe2.python.docs.parser -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import re diff --git a/caffe2/python/dyndep.py b/caffe2/python/dyndep.py index 8bea14423875..0382cc3a8212 100644 --- a/caffe2/python/dyndep.py +++ b/caffe2/python/dyndep.py @@ -1,9 +1,9 @@ ## @package dyndep # Module caffe2.python.dyndep -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import ctypes import os diff --git a/caffe2/python/embedding_generation_benchmark.py 
b/caffe2/python/embedding_generation_benchmark.py index a4d66036b93d..33dbf757dda4 100644 --- a/caffe2/python/embedding_generation_benchmark.py +++ b/caffe2/python/embedding_generation_benchmark.py @@ -1,9 +1,9 @@ ## @package embedding_generation_benchmark # Module caffe2.python.embedding_generation_benchmark -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, utils, model_helper diff --git a/caffe2/python/examples/char_rnn.py b/caffe2/python/examples/char_rnn.py index fb2059f94868..59e85431e8bf 100644 --- a/caffe2/python/examples/char_rnn.py +++ b/caffe2/python/examples/char_rnn.py @@ -1,9 +1,9 @@ ## @package char_rnn # Module caffe2.python.examples.char_rnn -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, model_helper, utils, brew from caffe2.python.rnn_cell import LSTM diff --git a/caffe2/python/examples/lmdb_create_example.py b/caffe2/python/examples/lmdb_create_example.py index b29b3b806001..af56069a7be0 100644 --- a/caffe2/python/examples/lmdb_create_example.py +++ b/caffe2/python/examples/lmdb_create_example.py @@ -1,9 +1,9 @@ ## @package lmdb_create_example # Module caffe2.python.examples.lmdb_create_example -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import numpy as np diff --git a/caffe2/python/experiment_util.py b/caffe2/python/experiment_util.py index cbe9491d9cf6..822a0a2950ba 100644 --- a/caffe2/python/experiment_util.py +++ b/caffe2/python/experiment_util.py @@ -1,9 +1,9 @@ ## @package experiment_util # Module caffe2.python.experiment_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import datetime import time diff --git a/caffe2/python/extension_loader.py b/caffe2/python/extension_loader.py index c533ae6d77bc..06c6707dcce9 100644 --- a/caffe2/python/extension_loader.py +++ b/caffe2/python/extension_loader.py @@ -1,9 +1,9 @@ ## @package extension_loader # Module caffe2.python.extension_loader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import contextlib import ctypes import sys diff --git a/caffe2/python/fakefp16_transform_lib.py b/caffe2/python/fakefp16_transform_lib.py index 885f15732055..c3f142061479 100644 --- a/caffe2/python/fakefp16_transform_lib.py +++ b/caffe2/python/fakefp16_transform_lib.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + import caffe2.python._import_c_extension as C from caffe2.proto.caffe2_pb2 import NetDef diff --git a/caffe2/python/fakelowp/init_shared_libs.py b/caffe2/python/fakelowp/init_shared_libs.py index d289c7c4a97d..2a98de4571aa 100644 --- a/caffe2/python/fakelowp/init_shared_libs.py +++ b/caffe2/python/fakelowp/init_shared_libs.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import ctypes import os diff --git a/caffe2/python/fakelowp/test_utils.py 
b/caffe2/python/fakelowp/test_utils.py index 75e4422f3ccc..4a31a92e5bce 100644 --- a/caffe2/python/fakelowp/test_utils.py +++ b/caffe2/python/fakelowp/test_utils.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import sys import numpy as np diff --git a/caffe2/python/filler_test.py b/caffe2/python/filler_test.py index 52ea756d5bea..9aff384e99af 100644 --- a/caffe2/python/filler_test.py +++ b/caffe2/python/filler_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, test_util, workspace diff --git a/caffe2/python/functional.py b/caffe2/python/functional.py index 7c26f69a0c43..d32acb3d8a90 100644 --- a/caffe2/python/functional.py +++ b/caffe2/python/functional.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/functional_test.py b/caffe2/python/functional_test.py index e7803e829bb4..d90943761aa4 100644 --- a/caffe2/python/functional_test.py +++ b/caffe2/python/functional_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py b/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py index d2ecf118ea27..a7e5d714b63c 100644 --- a/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py +++ b/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/gradient_check_test.py b/caffe2/python/gradient_check_test.py index 1b492229a433..3f8dd83b5538 100644 --- a/caffe2/python/gradient_check_test.py +++ b/caffe2/python/gradient_check_test.py @@ -2,10 +2,10 @@ # can gradually remove this test script. DO NOT ADD MORE TESTS TO THIS # FILE. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import ( brew, diff --git a/caffe2/python/gradient_checker.py b/caffe2/python/gradient_checker.py index b1cdcc2bbb56..afb8d5071492 100644 --- a/caffe2/python/gradient_checker.py +++ b/caffe2/python/gradient_checker.py @@ -1,9 +1,9 @@ ## @package gradient_checker # Module caffe2.python.gradient_checker -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/gru_cell.py b/caffe2/python/gru_cell.py index e6caa2cae1eb..049a9152878a 100644 --- a/caffe2/python/gru_cell.py +++ b/caffe2/python/gru_cell.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools from caffe2.python import brew, rnn_cell diff --git a/caffe2/python/helpers/algebra.py b/caffe2/python/helpers/algebra.py index 6bc3779a4ca1..948c55ac88ce 100644 --- a/caffe2/python/helpers/algebra.py +++ b/caffe2/python/helpers/algebra.py @@ -1,9 +1,9 @@ ## @package algebra # Module caffe2.python.helpers.algebra -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def transpose(model, blob_in, blob_out, use_cudnn=False, **kwargs): diff --git a/caffe2/python/helpers/arg_scope.py b/caffe2/python/helpers/arg_scope.py index ac6978be8064..a112e9b84c5d 100644 --- a/caffe2/python/helpers/arg_scope.py +++ b/caffe2/python/helpers/arg_scope.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import contextlib import copy import threading diff --git a/caffe2/python/helpers/array_helpers.py b/caffe2/python/helpers/array_helpers.py index 3f8955331d4e..fae0011bf1f6 100644 --- a/caffe2/python/helpers/array_helpers.py +++ b/caffe2/python/helpers/array_helpers.py @@ -1,9 +1,9 @@ ## @package arra_helpers # Module caffe2.python.helpers.array_helpers -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def concat(model, blobs_in, blob_out, **kwargs): diff --git a/caffe2/python/helpers/control_ops.py b/caffe2/python/helpers/control_ops.py index a738a71fe44c..c6f71d0761a5 100644 --- a/caffe2/python/helpers/control_ops.py +++ b/caffe2/python/helpers/control_ops.py @@ -1,9 +1,9 @@ ## @package control_ops # Module caffe2.python.helpers.control_ops -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.control_ops_util import add_if_op, add_while_op diff --git a/caffe2/python/helpers/conv.py b/caffe2/python/helpers/conv.py index bb88b2e3757f..dfca165084df 100644 --- a/caffe2/python/helpers/conv.py +++ b/caffe2/python/helpers/conv.py @@ -1,9 +1,9 @@ ## @package conv # Module caffe2.python.helpers.conv -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.modeling import initializers diff --git 
a/caffe2/python/helpers/db_input.py b/caffe2/python/helpers/db_input.py index 6e642a393da4..d5772cb7653e 100644 --- a/caffe2/python/helpers/db_input.py +++ b/caffe2/python/helpers/db_input.py @@ -1,9 +1,9 @@ ## @package db_input # Module caffe2.python.helpers.db_input -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def db_input(model, blobs_out, batch_size, db, db_type): dbreader_name = "dbreader_" + db diff --git a/caffe2/python/helpers/dropout.py b/caffe2/python/helpers/dropout.py index 6fbb5bcda99a..d7280318f60d 100644 --- a/caffe2/python/helpers/dropout.py +++ b/caffe2/python/helpers/dropout.py @@ -1,9 +1,9 @@ ## @package dropout # Module caffe2.python.helpers.dropout -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def dropout(model, blob_in, blob_out, use_cudnn=False, **kwargs): diff --git a/caffe2/python/helpers/elementwise_linear.py b/caffe2/python/helpers/elementwise_linear.py index 55fbd708489c..ef9184d00dd2 100644 --- a/caffe2/python/helpers/elementwise_linear.py +++ b/caffe2/python/helpers/elementwise_linear.py @@ -1,9 +1,9 @@ ## @package elementwise_linear # Module caffe2.python.helpers.elementwise_linear -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.modeling.parameter_info import ParameterTags diff --git a/caffe2/python/helpers/fc.py b/caffe2/python/helpers/fc.py index 9d61dc7ac145..0feb2b65745e 100644 --- a/caffe2/python/helpers/fc.py +++ b/caffe2/python/helpers/fc.py @@ -1,9 +1,9 @@ ## @package fc # Module caffe2.python.helpers.fc -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.modeling import initializers diff --git a/caffe2/python/helpers/nonlinearity.py b/caffe2/python/helpers/nonlinearity.py index f773cc3114de..3a8be3bb056a 100644 --- a/caffe2/python/helpers/nonlinearity.py +++ b/caffe2/python/helpers/nonlinearity.py @@ -1,9 +1,9 @@ ## @package nonlinearity # Module caffe2.python.helpers.nonlinearity -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core diff --git a/caffe2/python/helpers/normalization.py b/caffe2/python/helpers/normalization.py index 621f565b5455..b13b43f6859a 100644 --- a/caffe2/python/helpers/normalization.py +++ b/caffe2/python/helpers/normalization.py @@ -1,9 +1,9 @@ ## @package normalization # Module caffe2.python.helpers.normalization -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope from caffe2.python.modeling.parameter_info import ParameterTags diff --git a/caffe2/python/helpers/pooling.py b/caffe2/python/helpers/pooling.py index 412d55434d16..9e6fc784f289 100644 --- a/caffe2/python/helpers/pooling.py +++ b/caffe2/python/helpers/pooling.py @@ -2,10 +2,10 @@ # Module caffe2.python.helpers.pooling ## @package fc # Module caffe2.python.helpers.pooling -from __future__ import absolute_import -from __future__ import division -from 
__future__ import print_function -from __future__ import unicode_literals + + + + def max_pool(model, blob_in, blob_out, use_cudnn=False, order="NCHW", **kwargs): diff --git a/caffe2/python/helpers/tools.py b/caffe2/python/helpers/tools.py index 59defe9e236b..178620eab593 100644 --- a/caffe2/python/helpers/tools.py +++ b/caffe2/python/helpers/tools.py @@ -1,9 +1,9 @@ ## @package tools # Module caffe2.python.helpers.tools -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def image_input( diff --git a/caffe2/python/helpers/train.py b/caffe2/python/helpers/train.py index bee36347808a..02883af7402d 100644 --- a/caffe2/python/helpers/train.py +++ b/caffe2/python/helpers/train.py @@ -1,9 +1,9 @@ ## @package train # Module caffe2.python.helpers.train -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/hip_test_util.py b/caffe2/python/hip_test_util.py index 3910c9e5c2ce..beab3be1c40a 100644 --- a/caffe2/python/hip_test_util.py +++ b/caffe2/python/hip_test_util.py @@ -6,10 +6,10 @@ operators. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/hsm_util.py b/caffe2/python/hsm_util.py index e98056f9cd88..ec465c12240e 100644 --- a/caffe2/python/hsm_util.py +++ b/caffe2/python/hsm_util.py @@ -1,9 +1,9 @@ ## @package hsm_util # Module caffe2.python.hsm_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import hsm_pb2 diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 897be5fab44a..8a286383f60f 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np import copy diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 797010b46890..2000e269969e 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -34,10 +34,10 @@ implemented on the CPU. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import ( workspace, device_checker, gradient_checker, test_util, core) diff --git a/caffe2/python/ideep/LRN_op_test.py b/caffe2/python/ideep/LRN_op_test.py index 956f10be8831..23ecd79062f7 100644 --- a/caffe2/python/ideep/LRN_op_test.py +++ b/caffe2/python/ideep/LRN_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/adam_op_test.py b/caffe2/python/ideep/adam_op_test.py index a0d9b2ce014f..5ac0395bff63 100644 --- a/caffe2/python/ideep/adam_op_test.py +++ b/caffe2/python/ideep/adam_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/ideep/blobs_queue_db_test.py b/caffe2/python/ideep/blobs_queue_db_test.py index ded18e89c5ae..966fcc23d47d 100644 --- a/caffe2/python/ideep/blobs_queue_db_test.py +++ b/caffe2/python/ideep/blobs_queue_db_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/channel_shuffle_op_test.py b/caffe2/python/ideep/channel_shuffle_op_test.py index 8c3eea3d8618..b4cedca61061 100644 --- a/caffe2/python/ideep/channel_shuffle_op_test.py +++ b/caffe2/python/ideep/channel_shuffle_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/concat_split_op_test.py b/caffe2/python/ideep/concat_split_op_test.py index c28a7f1fe52c..75c9ceeba0e4 100644 --- a/caffe2/python/ideep/concat_split_op_test.py +++ b/caffe2/python/ideep/concat_split_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/ideep/conv_op_test.py b/caffe2/python/ideep/conv_op_test.py index e82d8aec5515..ae4473ea4864 100644 --- a/caffe2/python/ideep/conv_op_test.py +++ b/caffe2/python/ideep/conv_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import sys diff --git a/caffe2/python/ideep/conv_transpose_test.py b/caffe2/python/ideep/conv_transpose_test.py index be35dbd8a382..eeda2ea43a2d 100644 --- a/caffe2/python/ideep/conv_transpose_test.py +++ b/caffe2/python/ideep/conv_transpose_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/convfusion_op_test.py b/caffe2/python/ideep/convfusion_op_test.py index f24333745741..18ce574b623b 100644 --- 
a/caffe2/python/ideep/convfusion_op_test.py +++ b/caffe2/python/ideep/convfusion_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/copy_op_test.py b/caffe2/python/ideep/copy_op_test.py index 4b0a15bd999a..668282f2e159 100644 --- a/caffe2/python/ideep/copy_op_test.py +++ b/caffe2/python/ideep/copy_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/dropout_op_test.py b/caffe2/python/ideep/dropout_op_test.py index efecfb501bff..33b0a52a7421 100644 --- a/caffe2/python/ideep/dropout_op_test.py +++ b/caffe2/python/ideep/dropout_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from hypothesis import given diff --git a/caffe2/python/ideep/elementwise_sum_op_test.py b/caffe2/python/ideep/elementwise_sum_op_test.py index 9daf34088fc0..11a35d6b2b28 100644 --- a/caffe2/python/ideep/elementwise_sum_op_test.py +++ b/caffe2/python/ideep/elementwise_sum_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/expanddims_squeeze_op_test.py b/caffe2/python/ideep/expanddims_squeeze_op_test.py index 4a4fb7319b25..3693a217bb4b 100644 --- a/caffe2/python/ideep/expanddims_squeeze_op_test.py +++ b/caffe2/python/ideep/expanddims_squeeze_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/fc_op_test.py b/caffe2/python/ideep/fc_op_test.py index 9e29bfaed919..6549bb6ad6bb 100644 --- a/caffe2/python/ideep/fc_op_test.py +++ b/caffe2/python/ideep/fc_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from functools import reduce diff --git a/caffe2/python/ideep/leaky_relu_op_test.py b/caffe2/python/ideep/leaky_relu_op_test.py index 8a68d2e608ef..6d84f88f4fe2 100644 --- a/caffe2/python/ideep/leaky_relu_op_test.py +++ b/caffe2/python/ideep/leaky_relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/moment_sgd_op_test.py b/caffe2/python/ideep/moment_sgd_op_test.py index 06d0e9be0e57..596bab0ad3cc 100644 --- a/caffe2/python/ideep/moment_sgd_op_test.py +++ b/caffe2/python/ideep/moment_sgd_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git 
a/caffe2/python/ideep/operator_fallback_op_test.py b/caffe2/python/ideep/operator_fallback_op_test.py index 6d40a88b5c13..dc928c264082 100644 --- a/caffe2/python/ideep/operator_fallback_op_test.py +++ b/caffe2/python/ideep/operator_fallback_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/order_switch_op_test.py b/caffe2/python/ideep/order_switch_op_test.py index 8a967dcf9c08..a259e01bab10 100644 --- a/caffe2/python/ideep/order_switch_op_test.py +++ b/caffe2/python/ideep/order_switch_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/pool_op_test.py b/caffe2/python/ideep/pool_op_test.py index 9659d3961338..9ab3fcddbadb 100644 --- a/caffe2/python/ideep/pool_op_test.py +++ b/caffe2/python/ideep/pool_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/pre_convert_test.py b/caffe2/python/ideep/pre_convert_test.py index a32eedd74469..6c0b7ca5d7a7 100644 --- a/caffe2/python/ideep/pre_convert_test.py +++ b/caffe2/python/ideep/pre_convert_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/relu_op_test.py b/caffe2/python/ideep/relu_op_test.py index bd05c69381c5..e2fda68aed2b 100644 --- a/caffe2/python/ideep/relu_op_test.py +++ b/caffe2/python/ideep/relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/reshape_op_test.py b/caffe2/python/ideep/reshape_op_test.py index c9714f6eb4a5..c2bca948a52c 100644 --- a/caffe2/python/ideep/reshape_op_test.py +++ b/caffe2/python/ideep/reshape_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.test_util import TestCase from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/ideep/shape_op_test.py b/caffe2/python/ideep/shape_op_test.py index e1ab30c12e45..47114832f85d 100644 --- a/caffe2/python/ideep/shape_op_test.py +++ b/caffe2/python/ideep/shape_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/sigmoid_op_test.py b/caffe2/python/ideep/sigmoid_op_test.py index b67932108084..2b5eb0e3a2b5 100644 --- a/caffe2/python/ideep/sigmoid_op_test.py +++ b/caffe2/python/ideep/sigmoid_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from 
__future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/softmax_op_test.py b/caffe2/python/ideep/softmax_op_test.py index 9043061514a0..b76d6509609b 100644 --- a/caffe2/python/ideep/softmax_op_test.py +++ b/caffe2/python/ideep/softmax_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/spatial_bn_op_test.py b/caffe2/python/ideep/spatial_bn_op_test.py index 25b83e2447fc..618a0e7fbfc3 100644 --- a/caffe2/python/ideep/spatial_bn_op_test.py +++ b/caffe2/python/ideep/spatial_bn_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/ideep/test_ideep_net.py b/caffe2/python/ideep/test_ideep_net.py index b0483cf4c4b6..aa1c5bc260fa 100644 --- a/caffe2/python/ideep/test_ideep_net.py +++ b/caffe2/python/ideep/test_ideep_net.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/ideep/transform_ideep_net.py b/caffe2/python/ideep/transform_ideep_net.py index 6345b76735a7..962d4051718b 100644 --- a/caffe2/python/ideep/transform_ideep_net.py +++ b/caffe2/python/ideep/transform_ideep_net.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import copy diff --git a/caffe2/python/ideep/transpose_op_test.py b/caffe2/python/ideep/transpose_op_test.py index b02085a3ba3b..8b324ed964ae 100644 --- a/caffe2/python/ideep/transpose_op_test.py +++ b/caffe2/python/ideep/transpose_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/weightedsum_op_test.py b/caffe2/python/ideep/weightedsum_op_test.py index 2a0b3ec3e7b0..b1e46fca4851 100644 --- a/caffe2/python/ideep/weightedsum_op_test.py +++ b/caffe2/python/ideep/weightedsum_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/ideep_test_util.py b/caffe2/python/ideep_test_util.py index e131ee027c35..7129ed14ba74 100644 --- a/caffe2/python/ideep_test_util.py +++ b/caffe2/python/ideep_test_util.py @@ -6,10 +6,10 @@ operators. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py index 90e5a4d76b6d..7c3dda3b320c 100644 --- a/caffe2/python/layer_model_helper.py +++ b/caffe2/python/layer_model_helper.py @@ -1,9 +1,9 @@ # @package layer_model_helper # Module caffe2.python.layer_model_helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, model_helper, schema, scope, utils, muji from caffe2.python.modeling.parameter_info import ( diff --git a/caffe2/python/layer_model_instantiator.py b/caffe2/python/layer_model_instantiator.py index 9ceb1310bf30..9284b9b9e687 100644 --- a/caffe2/python/layer_model_instantiator.py +++ b/caffe2/python/layer_model_instantiator.py @@ -1,9 +1,9 @@ ## @package layer_model_instantiator # Module caffe2.python.layer_model_instantiator -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import InstantiationContext diff --git a/caffe2/python/layer_parameter_sharing_test.py b/caffe2/python/layer_parameter_sharing_test.py index 5d87dbd7522a..518412b9e90c 100644 --- a/caffe2/python/layer_parameter_sharing_test.py +++ b/caffe2/python/layer_parameter_sharing_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope from caffe2.python.modeling.parameter_sharing import ( diff --git a/caffe2/python/layer_test_util.py b/caffe2/python/layer_test_util.py index 2f2e23062e34..ae28e82b98cc 100644 --- a/caffe2/python/layer_test_util.py +++ b/caffe2/python/layer_test_util.py @@ -1,9 +1,9 @@ ## @package layer_test_util # Module caffe2.python.layer_test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from collections import namedtuple diff --git a/caffe2/python/layers/__init__.py b/caffe2/python/layers/__init__.py index 2a09dc8419a6..487b7751fd08 100644 --- a/caffe2/python/layers/__init__.py +++ b/caffe2/python/layers/__init__.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from importlib import import_module import pkgutil diff --git a/caffe2/python/layers/adaptive_weight.py b/caffe2/python/layers/adaptive_weight.py index c081e8573038..146a0bdb1974 100644 --- a/caffe2/python/layers/adaptive_weight.py +++ b/caffe2/python/layers/adaptive_weight.py @@ -1,6 +1,6 @@ # @package adaptive_weight # Module caffe2.fb.python.layers.adaptive_weight -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python import core, schema diff --git a/caffe2/python/layers/add_bias.py b/caffe2/python/layers/add_bias.py index 0ffa46afb2b3..1a0fd8b295f3 100644 --- a/caffe2/python/layers/add_bias.py +++ b/caffe2/python/layers/add_bias.py @@ -1,9 +1,9 @@ ## @package add_bias # Module caffe2.python.layers.add_bias -from 
__future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/arc_cosine_feature_map.py b/caffe2/python/layers/arc_cosine_feature_map.py index 2409eca551a1..89c5014f5c5c 100644 --- a/caffe2/python/layers/arc_cosine_feature_map.py +++ b/caffe2/python/layers/arc_cosine_feature_map.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/batch_huber_loss.py b/caffe2/python/layers/batch_huber_loss.py index 48b6ebcf8f58..0a5323625419 100644 --- a/caffe2/python/layers/batch_huber_loss.py +++ b/caffe2/python/layers/batch_huber_loss.py @@ -1,9 +1,9 @@ # @package batch_huber_loss # Module caffe2.python.layers.batch_huber_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/batch_lr_loss.py b/caffe2/python/layers/batch_lr_loss.py index a560a3f654a9..46b0e4d42cdf 100644 --- a/caffe2/python/layers/batch_lr_loss.py +++ b/caffe2/python/layers/batch_lr_loss.py @@ -1,9 +1,9 @@ ## @package batch_lr_loss # Module caffe2.python.layers.batch_lr_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/batch_mse_loss.py b/caffe2/python/layers/batch_mse_loss.py index 89da74f3c1e9..b0dd63ab09c8 100644 --- a/caffe2/python/layers/batch_mse_loss.py +++ b/caffe2/python/layers/batch_mse_loss.py @@ -1,9 +1,9 @@ ## @package batch_mse_loss # Module caffe2.python.layers.batch_mse_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/batch_normalization.py b/caffe2/python/layers/batch_normalization.py index 9fe3ee51eb56..6395b09ff67f 100644 --- a/caffe2/python/layers/batch_normalization.py +++ b/caffe2/python/layers/batch_normalization.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py b/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py index 9ef8cf563dbe..84e7d4873f50 100644 --- a/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py +++ b/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py @@ -1,9 +1,9 @@ ## @package batch_sigmoid_cross_entropy_loss # Module caffe2.python.layers.batch_sigmoid_cross_entropy_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from 
caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/batch_softmax_loss.py b/caffe2/python/layers/batch_softmax_loss.py index d5f9413ef96a..30667a04c159 100644 --- a/caffe2/python/layers/batch_softmax_loss.py +++ b/caffe2/python/layers/batch_softmax_loss.py @@ -1,9 +1,9 @@ ## @package batch_softmax_loss # Module caffe2.python.layers.batch_softmax_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/blob_weighted_sum.py b/caffe2/python/layers/blob_weighted_sum.py index cf8ecfd99045..a37fab463581 100644 --- a/caffe2/python/layers/blob_weighted_sum.py +++ b/caffe2/python/layers/blob_weighted_sum.py @@ -1,9 +1,9 @@ ## @package BlobWeightedSum # Module caffe2.python.layers.blob_weighted_sum -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/bpr_loss.py b/caffe2/python/layers/bpr_loss.py index 4e6a60fdaa57..389de8c241e8 100644 --- a/caffe2/python/layers/bpr_loss.py +++ b/caffe2/python/layers/bpr_loss.py @@ -1,9 +1,9 @@ ## @package bpr_loss # Module caffe2.python.layers.bpr_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/bucket_weighted.py b/caffe2/python/layers/bucket_weighted.py index 3c750e7b136f..2c200a922fdd 100644 --- a/caffe2/python/layers/bucket_weighted.py +++ b/caffe2/python/layers/bucket_weighted.py @@ -1,9 +1,9 @@ ## @package bucket_weighted # Module caffe2.python.layers.bucket_weighted -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging import numpy as np diff --git a/caffe2/python/layers/build_index.py b/caffe2/python/layers/build_index.py index b8c999bc256e..29c63f3d8948 100644 --- a/caffe2/python/layers/build_index.py +++ b/caffe2/python/layers/build_index.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/layers/concat.py b/caffe2/python/layers/concat.py index 062485757edc..fb1dc6ab6dbf 100644 --- a/caffe2/python/layers/concat.py +++ b/caffe2/python/layers/concat.py @@ -1,9 +1,9 @@ ## @package concat # Module caffe2.python.layers.concat -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/constant_weight.py b/caffe2/python/layers/constant_weight.py index 06e9d9cd9b66..d160ed8206b3 100644 --- a/caffe2/python/layers/constant_weight.py +++ b/caffe2/python/layers/constant_weight.py @@ -1,9 +1,9 @@ # @package constant_weight # Module caffe2.fb.python.layers.constant_weight -from __future__ import absolute_import -from __future__ import division -from __future__ import 
print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/conv.py b/caffe2/python/layers/conv.py index bb22acf0cafa..e98bac7e2d80 100644 --- a/caffe2/python/layers/conv.py +++ b/caffe2/python/layers/conv.py @@ -1,9 +1,9 @@ ## @package conv # Module caffe2.python.layers.conv -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/dropout.py b/caffe2/python/layers/dropout.py index a5d3f01a440e..4bc0cf2785b2 100644 --- a/caffe2/python/layers/dropout.py +++ b/caffe2/python/layers/dropout.py @@ -1,8 +1,8 @@ # Module caffe2.python.layers.dropout -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/fc.py b/caffe2/python/layers/fc.py index a9eeceff2c21..9220f22165a3 100644 --- a/caffe2/python/layers/fc.py +++ b/caffe2/python/layers/fc.py @@ -1,9 +1,9 @@ ## @package fc # Module caffe2.python.layers.fc -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.helpers.arg_scope import get_current_scope from caffe2.python import schema diff --git a/caffe2/python/layers/fc_with_bootstrap.py b/caffe2/python/layers/fc_with_bootstrap.py index 6a48f572ddba..b3c2eb346f96 100644 --- a/caffe2/python/layers/fc_with_bootstrap.py +++ b/caffe2/python/layers/fc_with_bootstrap.py @@ -1,6 +1,6 @@ ## @package fc_with_bootstrap # Module caffe2.python.layers.fc_with_bootstrap -from __future__ import absolute_import, division, print_function, unicode_literals + import math diff --git a/caffe2/python/layers/fc_without_bias.py b/caffe2/python/layers/fc_without_bias.py index e8923a8e5b9c..2899af618b79 100644 --- a/caffe2/python/layers/fc_without_bias.py +++ b/caffe2/python/layers/fc_without_bias.py @@ -1,9 +1,9 @@ ## @package fc_without_bias # Module caffe2.python.layers.fc_without_bias -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/feature_sparse_to_dense.py b/caffe2/python/layers/feature_sparse_to_dense.py index 69fe91a48691..ca004d136ded 100644 --- a/caffe2/python/layers/feature_sparse_to_dense.py +++ b/caffe2/python/layers/feature_sparse_to_dense.py @@ -1,6 +1,6 @@ # @package sparse_to_dense # Module caffe2.python.layers.sparse_to_dense -from __future__ import absolute_import, division, print_function, unicode_literals + from collections import defaultdict diff --git a/caffe2/python/layers/functional.py b/caffe2/python/layers/functional.py index 53d5c050242f..c6d156fd68ce 100644 --- a/caffe2/python/layers/functional.py +++ b/caffe2/python/layers/functional.py @@ -1,9 +1,9 @@ # @package functional # Module caffe2.python.layers.functional -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python 
import core, schema, scope, workspace from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/gather_record.py b/caffe2/python/layers/gather_record.py index 1289c097902c..da468d5db90c 100644 --- a/caffe2/python/layers/gather_record.py +++ b/caffe2/python/layers/gather_record.py @@ -1,9 +1,9 @@ ## @package gather_record # Module caffe2.python.layers.gather_record -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/homotopy_weight.py b/caffe2/python/layers/homotopy_weight.py index 63da1f04abf4..4c24223cbc8d 100644 --- a/caffe2/python/layers/homotopy_weight.py +++ b/caffe2/python/layers/homotopy_weight.py @@ -1,10 +1,10 @@ # @package homotopy_weight # Module caffe2.fb.python.layers.homotopy_weight -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/label_smooth.py b/caffe2/python/layers/label_smooth.py index e2282e051611..7e4987270660 100644 --- a/caffe2/python/layers/label_smooth.py +++ b/caffe2/python/layers/label_smooth.py @@ -15,10 +15,10 @@ # @package label_smooth # Module caffe2.python.layers.label_smooth -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/last_n_window_collector.py b/caffe2/python/layers/last_n_window_collector.py index fb93effbff2d..a16b731a2f78 100644 --- a/caffe2/python/layers/last_n_window_collector.py +++ b/caffe2/python/layers/last_n_window_collector.py @@ -1,9 +1,9 @@ ## @package last_n_window_collector # Module caffe2.python.layers.last_n_window_collector -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/layer_normalization.py b/caffe2/python/layers/layer_normalization.py index 0dc6795994cb..580a03bfc5da 100644 --- a/caffe2/python/layers/layer_normalization.py +++ b/caffe2/python/layers/layer_normalization.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/layers.py b/caffe2/python/layers/layers.py index 216d0b2e3286..abcdd1596220 100644 --- a/caffe2/python/layers/layers.py +++ b/caffe2/python/layers/layers.py @@ -1,6 +1,6 @@ ## @package layers # Module caffe2.python.layers.layers -from __future__ import absolute_import, division, print_function, unicode_literals + import logging from collections import namedtuple diff --git a/caffe2/python/layers/margin_rank_loss.py b/caffe2/python/layers/margin_rank_loss.py index 15267752caa3..6f97ade23ef4 100644 --- a/caffe2/python/layers/margin_rank_loss.py +++ b/caffe2/python/layers/margin_rank_loss.py @@ -1,9 +1,9 @@ ## @package 
random_neg_rank_loss # Module caffe2.python.layers.random_neg_rank_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema, core from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/merge_id_lists.py b/caffe2/python/layers/merge_id_lists.py index 117dd7904787..68c27b587567 100644 --- a/caffe2/python/layers/merge_id_lists.py +++ b/caffe2/python/layers/merge_id_lists.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/pairwise_similarity.py b/caffe2/python/layers/pairwise_similarity.py index 30cb6ace2b81..5020e5432c2a 100644 --- a/caffe2/python/layers/pairwise_similarity.py +++ b/caffe2/python/layers/pairwise_similarity.py @@ -1,9 +1,9 @@ ## @package dot_product # Module caffe2.python.layers.dot_product -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/position_weighted.py b/caffe2/python/layers/position_weighted.py index 19ddda2b6dcf..12e26bcd774e 100644 --- a/caffe2/python/layers/position_weighted.py +++ b/caffe2/python/layers/position_weighted.py @@ -1,9 +1,9 @@ ## @package position_weighted # Module caffe2.python.layers.position_weighted -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging import numpy as np diff --git a/caffe2/python/layers/random_fourier_features.py b/caffe2/python/layers/random_fourier_features.py index 6056da4ba7cf..bde05ab97147 100644 --- a/caffe2/python/layers/random_fourier_features.py +++ b/caffe2/python/layers/random_fourier_features.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/reservoir_sampling.py b/caffe2/python/layers/reservoir_sampling.py index 3819a1971da4..21b9c44f2a79 100644 --- a/caffe2/python/layers/reservoir_sampling.py +++ b/caffe2/python/layers/reservoir_sampling.py @@ -1,9 +1,9 @@ ## @package reservoir_sampling # Module caffe2.python.layers.reservoir_sampling -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/sampling_train.py b/caffe2/python/layers/sampling_train.py index 1c617326da7f..034c897e2c2f 100644 --- a/caffe2/python/layers/sampling_train.py +++ b/caffe2/python/layers/sampling_train.py @@ -1,9 +1,9 @@ ## @package sampling_train # Module caffe2.python.layers.sampling_train -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer, get_layer_class 
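The hunks in this part of the patch all make the same mechanical change: the Python 2 compatibility imports (from __future__ import absolute_import / division / print_function / unicode_literals) are removed and each removed line is replaced by a blank line, leaving the rest of every module untouched. A change of this shape is normally produced by a small codemod rather than edited by hand; the script below is a minimal sketch of such a tool (hypothetical; the patch does not state which tool was actually used).

    # Hypothetical codemod sketch: blank out `from __future__ import ...` lines
    # so each removed import becomes an empty line, matching the '-from ...' -> '+'
    # pattern in the hunks above and below. Not the actual tool used for this change.
    import re
    import sys

    FUTURE_IMPORT = re.compile(r"^from\s+__future__\s+import\s+.*$")

    def strip_future_imports(path):
        with open(path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        changed = False
        for i, line in enumerate(lines):
            if FUTURE_IMPORT.match(line):
                lines[i] = "\n"  # keep the line count, drop the import
                changed = True
        if changed:
            with open(path, "w", encoding="utf-8") as f:
                f.writelines(lines)
        return changed

    if __name__ == "__main__":
        for p in sys.argv[1:]:
            if strip_future_imports(p):
                print("rewrote", p)

Run over the caffe2/python tree, a script like this would reproduce hunks of the same form as those in this patch, including the single-line "from __future__ import absolute_import, division, print_function, unicode_literals" variants, which the regex also matches.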
diff --git a/caffe2/python/layers/sampling_trainable_mixin.py b/caffe2/python/layers/sampling_trainable_mixin.py index 911fd8391e3f..403cc5a4a51c 100644 --- a/caffe2/python/layers/sampling_trainable_mixin.py +++ b/caffe2/python/layers/sampling_trainable_mixin.py @@ -1,9 +1,9 @@ ## @package sampling_trainable_mixin # Module caffe2.python.layers.sampling_trainable_mixin -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import abc import six diff --git a/caffe2/python/layers/select_record_by_context.py b/caffe2/python/layers/select_record_by_context.py index 65e44bece97c..49e42ca308d7 100644 --- a/caffe2/python/layers/select_record_by_context.py +++ b/caffe2/python/layers/select_record_by_context.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging diff --git a/caffe2/python/layers/semi_random_features.py b/caffe2/python/layers/semi_random_features.py index d7b96d956d08..58f30ac71f19 100644 --- a/caffe2/python/layers/semi_random_features.py +++ b/caffe2/python/layers/semi_random_features.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.arc_cosine_feature_map import ArcCosineFeatureMap diff --git a/caffe2/python/layers/sparse_dropout_with_replacement.py b/caffe2/python/layers/sparse_dropout_with_replacement.py index 8275d83d8734..3e03888e57dc 100644 --- a/caffe2/python/layers/sparse_dropout_with_replacement.py +++ b/caffe2/python/layers/sparse_dropout_with_replacement.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/sparse_feature_hash.py b/caffe2/python/layers/sparse_feature_hash.py index 3927b199fbdf..c3ada99dc4a7 100644 --- a/caffe2/python/layers/sparse_feature_hash.py +++ b/caffe2/python/layers/sparse_feature_hash.py @@ -1,9 +1,9 @@ ## @package sparse_feature_hash # Module caffe2.python.layers.sparse_feature_hash -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema, core from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py index 30cb60266c4d..dd1c42606063 100644 --- a/caffe2/python/layers/sparse_lookup.py +++ b/caffe2/python/layers/sparse_lookup.py @@ -1,9 +1,9 @@ ## @package sparse_lookup # Module caffe2.python.layers.sparse_lookup -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.optimizer import FP16_ENGINES, Optimizer from caffe2.python.helpers.arg_scope import get_current_scope diff --git a/caffe2/python/layers/split.py b/caffe2/python/layers/split.py index a83881f5a091..58e569a272c7 100644 --- a/caffe2/python/layers/split.py +++ b/caffe2/python/layers/split.py @@ -1,9 +1,9 @@ ## @package split # Module caffe2.python.layers.split -from __future__ import absolute_import 
-from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/tags.py b/caffe2/python/layers/tags.py index 28b7312dbcaa..5161ee2e1a96 100644 --- a/caffe2/python/layers/tags.py +++ b/caffe2/python/layers/tags.py @@ -1,9 +1,9 @@ ## @package tags # Module caffe2.python.layers.tags -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import six diff --git a/caffe2/python/layers/uniform_sampling.py b/caffe2/python/layers/uniform_sampling.py index 46ed29bbaa41..5581371d008d 100644 --- a/caffe2/python/layers/uniform_sampling.py +++ b/caffe2/python/layers/uniform_sampling.py @@ -1,9 +1,9 @@ ## @package uniform_sampling # Module caffe2.python.layers.uniform_sampling -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/layers_test.py b/caffe2/python/layers_test.py index 4d037a891ade..e084a011d357 100644 --- a/caffe2/python/layers_test.py +++ b/caffe2/python/layers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/lazy_dyndep.py b/caffe2/python/lazy_dyndep.py index e1799838f4b2..e53d4fda350b 100644 --- a/caffe2/python/lazy_dyndep.py +++ b/caffe2/python/lazy_dyndep.py @@ -1,9 +1,9 @@ ## @package lazy_dyndep # Module caffe2.python.lazy_dyndep -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os from caffe2.python import dyndep, lazy diff --git a/caffe2/python/lazy_dyndep_test.py b/caffe2/python/lazy_dyndep_test.py index 881215ac36e3..1441facd3a6f 100644 --- a/caffe2/python/lazy_dyndep_test.py +++ b/caffe2/python/lazy_dyndep_test.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py b/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py index f08e9147d3ba..718b7fb3a987 100644 --- a/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py +++ b/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py b/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py index d73db5aaa36c..a38d442dd952 100644 --- a/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py +++ b/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git 
a/caffe2/python/lstm_benchmark.py b/caffe2/python/lstm_benchmark.py index cfa53a81610c..29f819ec622e 100644 --- a/caffe2/python/lstm_benchmark.py +++ b/caffe2/python/lstm_benchmark.py @@ -1,9 +1,9 @@ ## @package lstm_benchmark # Module caffe2.python.lstm_benchmark -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, utils, rnn_cell, model_helper diff --git a/caffe2/python/memonger.py b/caffe2/python/memonger.py index c299c817ace4..a728fc4e2157 100644 --- a/caffe2/python/memonger.py +++ b/caffe2/python/memonger.py @@ -1,9 +1,9 @@ ## @package memonger # Module caffe2.python.memonger -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import networkx as nx import collections diff --git a/caffe2/python/memonger_test.py b/caffe2/python/memonger_test.py index 7d5c52224b1c..8584e8d5e4cc 100644 --- a/caffe2/python/memonger_test.py +++ b/caffe2/python/memonger_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/mkl/mkl_LRN_op_test.py b/caffe2/python/mkl/mkl_LRN_op_test.py index 73df4820a5d1..2b084bea591b 100644 --- a/caffe2/python/mkl/mkl_LRN_op_test.py +++ b/caffe2/python/mkl/mkl_LRN_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_LRN_speed_test.py b/caffe2/python/mkl/mkl_LRN_speed_test.py index 35eae62d5be1..ae42902d9102 100644 --- a/caffe2/python/mkl/mkl_LRN_speed_test.py +++ b/caffe2/python/mkl/mkl_LRN_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_concat_op_test.py b/caffe2/python/mkl/mkl_concat_op_test.py index a1a96ca755d9..8b01f8885b1c 100644 --- a/caffe2/python/mkl/mkl_concat_op_test.py +++ b/caffe2/python/mkl/mkl_concat_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_conv_op_test.py b/caffe2/python/mkl/mkl_conv_op_test.py index 38ceb680bb6d..f1fe7b062318 100644 --- a/caffe2/python/mkl/mkl_conv_op_test.py +++ b/caffe2/python/mkl/mkl_conv_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_copy_op_test.py b/caffe2/python/mkl/mkl_copy_op_test.py index 633865cd5047..b2baeb9ef1af 100644 --- a/caffe2/python/mkl/mkl_copy_op_test.py +++ b/caffe2/python/mkl/mkl_copy_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import 
unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_elementwise_add_op_test.py b/caffe2/python/mkl/mkl_elementwise_add_op_test.py index eab454ffe105..0709b5afd9f6 100644 --- a/caffe2/python/mkl/mkl_elementwise_add_op_test.py +++ b/caffe2/python/mkl/mkl_elementwise_add_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_elementwise_sum_op_test.py b/caffe2/python/mkl/mkl_elementwise_sum_op_test.py index 71e0754a0214..3adec4848e50 100644 --- a/caffe2/python/mkl/mkl_elementwise_sum_op_test.py +++ b/caffe2/python/mkl/mkl_elementwise_sum_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_fc_op_test.py b/caffe2/python/mkl/mkl_fc_op_test.py index 01e8c9b5a925..01786d55c337 100644 --- a/caffe2/python/mkl/mkl_fc_op_test.py +++ b/caffe2/python/mkl/mkl_fc_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_fc_speed_test.py b/caffe2/python/mkl/mkl_fc_speed_test.py index 7cabadfe1da0..85f5605e9676 100644 --- a/caffe2/python/mkl/mkl_fc_speed_test.py +++ b/caffe2/python/mkl/mkl_fc_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_fill_op_test.py b/caffe2/python/mkl/mkl_fill_op_test.py index dbdf12c1aca4..26a9b7131b0b 100644 --- a/caffe2/python/mkl/mkl_fill_op_test.py +++ b/caffe2/python/mkl/mkl_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_pool_op_test.py b/caffe2/python/mkl/mkl_pool_op_test.py index b733edaace1c..a56e9448317a 100644 --- a/caffe2/python/mkl/mkl_pool_op_test.py +++ b/caffe2/python/mkl/mkl_pool_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_pool_speed_test.py b/caffe2/python/mkl/mkl_pool_speed_test.py index a0fa8ca6ece8..b25e0f915cc7 100644 --- a/caffe2/python/mkl/mkl_pool_speed_test.py +++ b/caffe2/python/mkl/mkl_pool_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_relu_op_test.py b/caffe2/python/mkl/mkl_relu_op_test.py index 90e365da554b..76ec33bcbe91 100644 --- a/caffe2/python/mkl/mkl_relu_op_test.py +++ b/caffe2/python/mkl/mkl_relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from 
__future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_sbn_op_test.py b/caffe2/python/mkl/mkl_sbn_op_test.py index 4a5fad2b7b68..2ac9080ce670 100644 --- a/caffe2/python/mkl/mkl_sbn_op_test.py +++ b/caffe2/python/mkl/mkl_sbn_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_sbn_speed_test.py b/caffe2/python/mkl/mkl_sbn_speed_test.py index d37bef32b9b7..3b3b71d1c997 100644 --- a/caffe2/python/mkl/mkl_sbn_speed_test.py +++ b/caffe2/python/mkl/mkl_sbn_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_sigmoid_op_test.py b/caffe2/python/mkl/mkl_sigmoid_op_test.py index 654008c67b7d..abdb0983778d 100644 --- a/caffe2/python/mkl/mkl_sigmoid_op_test.py +++ b/caffe2/python/mkl/mkl_sigmoid_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_speed_test.py b/caffe2/python/mkl/mkl_speed_test.py index 4034705580d5..9a7310a484d1 100644 --- a/caffe2/python/mkl/mkl_speed_test.py +++ b/caffe2/python/mkl/mkl_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_squeeze_op_test.py b/caffe2/python/mkl/mkl_squeeze_op_test.py index 1e4b5791b0b6..8af090f60d88 100644 --- a/caffe2/python/mkl/mkl_squeeze_op_test.py +++ b/caffe2/python/mkl/mkl_squeeze_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index c003e0e3b09b..3a88a3deeccc 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import copy from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/mkl/rewrite_graph_test.py b/caffe2/python/mkl/rewrite_graph_test.py index 42e3269fc4d8..1ad209cdbdfd 100644 --- a/caffe2/python/mkl/rewrite_graph_test.py +++ b/caffe2/python/mkl/rewrite_graph_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl_test_util.py b/caffe2/python/mkl_test_util.py index 5d8f66500190..88fb3cc800ec 100644 --- a/caffe2/python/mkl_test_util.py +++ b/caffe2/python/mkl_test_util.py @@ -6,10 +6,10 @@ operators. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py index a26bf844f2de..a5a4865c0ec1 100644 --- a/caffe2/python/model_helper.py +++ b/caffe2/python/model_helper.py @@ -1,9 +1,9 @@ ## @package model_helper # Module caffe2.python.model_helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope, workspace from caffe2.python.helpers.db_input import db_input diff --git a/caffe2/python/model_helper_test.py b/caffe2/python/model_helper_test.py index fcccddf401db..1423e4a97733 100644 --- a/caffe2/python/model_helper_test.py +++ b/caffe2/python/model_helper_test.py @@ -1,6 +1,6 @@ """unittest for ModelHelper class""" -from __future__ import absolute_import, division, print_function + import unittest diff --git a/caffe2/python/modeling/compute_histogram_for_blobs.py b/caffe2/python/modeling/compute_histogram_for_blobs.py index 3b5ea4b64cba..ea83f96f7019 100644 --- a/caffe2/python/modeling/compute_histogram_for_blobs.py +++ b/caffe2/python/modeling/compute_histogram_for_blobs.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/compute_histogram_for_blobs_test.py b/caffe2/python/modeling/compute_histogram_for_blobs_test.py index 6c3b59950898..4ce6bf11487a 100644 --- a/caffe2/python/modeling/compute_histogram_for_blobs_test.py +++ b/caffe2/python/modeling/compute_histogram_for_blobs_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/compute_norm_for_blobs.py b/caffe2/python/modeling/compute_norm_for_blobs.py index 41b7f88d24eb..24ed7a7482c7 100644 --- a/caffe2/python/modeling/compute_norm_for_blobs.py +++ b/caffe2/python/modeling/compute_norm_for_blobs.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema, muji from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/compute_norm_for_blobs_test.py b/caffe2/python/modeling/compute_norm_for_blobs_test.py index 3fefce0c4420..1bf3dae0353f 100644 --- a/caffe2/python/modeling/compute_norm_for_blobs_test.py +++ b/caffe2/python/modeling/compute_norm_for_blobs_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/compute_statistics_for_blobs.py b/caffe2/python/modeling/compute_statistics_for_blobs.py index 9a3fbcc96954..588b4a827cb8 100644 --- a/caffe2/python/modeling/compute_statistics_for_blobs.py +++ b/caffe2/python/modeling/compute_statistics_for_blobs.py @@ -1,7 
+1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/compute_statistics_for_blobs_test.py b/caffe2/python/modeling/compute_statistics_for_blobs_test.py index e880f3edacb1..bf75a1f7d149 100644 --- a/caffe2/python/modeling/compute_statistics_for_blobs_test.py +++ b/caffe2/python/modeling/compute_statistics_for_blobs_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/get_entry_from_blobs.py b/caffe2/python/modeling/get_entry_from_blobs.py index 88daa226c887..061dfe33991b 100644 --- a/caffe2/python/modeling/get_entry_from_blobs.py +++ b/caffe2/python/modeling/get_entry_from_blobs.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/get_entry_from_blobs_test.py b/caffe2/python/modeling/get_entry_from_blobs_test.py index 8f4fbb864be1..3ec146766f30 100644 --- a/caffe2/python/modeling/get_entry_from_blobs_test.py +++ b/caffe2/python/modeling/get_entry_from_blobs_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/gradient_clipping.py b/caffe2/python/modeling/gradient_clipping.py index 1999ced9ba1b..b01bc2ba301f 100644 --- a/caffe2/python/modeling/gradient_clipping.py +++ b/caffe2/python/modeling/gradient_clipping.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/modeling/gradient_clipping_test.py b/caffe2/python/modeling/gradient_clipping_test.py index ca5c2ba8e22b..0b0e962cb727 100644 --- a/caffe2/python/modeling/gradient_clipping_test.py +++ b/caffe2/python/modeling/gradient_clipping_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/initializers.py b/caffe2/python/modeling/initializers.py index 2053d9e53976..b3e4b1a44dd7 100644 --- a/caffe2/python/modeling/initializers.py +++ b/caffe2/python/modeling/initializers.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.core import DataType, BlobReference, ScopedBlobReference from caffe2.python.modeling.parameter_info import ParameterInfo diff --git a/caffe2/python/modeling/initializers_test.py b/caffe2/python/modeling/initializers_test.py index 0355d1871787..fad40c159b6e 100644 --- a/caffe2/python/modeling/initializers_test.py +++ b/caffe2/python/modeling/initializers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import brew, model_helper, workspace diff --git a/caffe2/python/modeling/net_modifier.py b/caffe2/python/modeling/net_modifier.py index 0f0ac7535c88..e824c828e4bd 100644 --- a/caffe2/python/modeling/net_modifier.py +++ b/caffe2/python/modeling/net_modifier.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import abc import six diff --git a/caffe2/python/modeling/parameter_info.py b/caffe2/python/modeling/parameter_info.py index 589aa51a7b1c..195048cf91e8 100644 --- a/caffe2/python/modeling/parameter_info.py +++ b/caffe2/python/modeling/parameter_info.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core diff --git a/caffe2/python/modeling/parameter_sharing.py b/caffe2/python/modeling/parameter_sharing.py index 77e5cbd3f8bc..a0174500a413 100644 --- a/caffe2/python/modeling/parameter_sharing.py +++ b/caffe2/python/modeling/parameter_sharing.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope diff --git a/caffe2/python/modeling/parameter_sharing_test.py b/caffe2/python/modeling/parameter_sharing_test.py index f616fc1ea6ed..d37e40880c02 100644 --- a/caffe2/python/modeling/parameter_sharing_test.py +++ b/caffe2/python/modeling/parameter_sharing_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, model_helper, scope from caffe2.python.modeling.parameter_sharing import ( diff --git a/caffe2/python/models/__sym_init__.py b/caffe2/python/models/__sym_init__.py index 79f045879ebc..fa10bff7246b 100644 --- a/caffe2/python/models/__sym_init__.py +++ b/caffe2/python/models/__sym_init__.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from 
__future__ import unicode_literals + + + + import os from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/models/download.py b/caffe2/python/models/download.py index 4b9a570de807..46a9b59f6627 100644 --- a/caffe2/python/models/download.py +++ b/caffe2/python/models/download.py @@ -1,9 +1,9 @@ ## @package download # Module caffe2.python.models.download -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import os import sys diff --git a/caffe2/python/models/imagenet_trainer_test_utils.py b/caffe2/python/models/imagenet_trainer_test_utils.py index 59107336ccd6..fec7708ea150 100644 --- a/caffe2/python/models/imagenet_trainer_test_utils.py +++ b/caffe2/python/models/imagenet_trainer_test_utils.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import time diff --git a/caffe2/python/models/resnet.py b/caffe2/python/models/resnet.py index 41ca087d9637..430d3d335e1e 100644 --- a/caffe2/python/models/resnet.py +++ b/caffe2/python/models/resnet.py @@ -1,9 +1,9 @@ ## @package resnet # Module caffe2.python.models.resnet -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import brew import logging diff --git a/caffe2/python/models/resnet_test.py b/caffe2/python/models/resnet_test.py index ce542e8da046..38d87cefff05 100644 --- a/caffe2/python/models/resnet_test.py +++ b/caffe2/python/models/resnet_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/models/seq2seq/beam_search.py b/caffe2/python/models/seq2seq/beam_search.py index 7b909697fb05..6fc9f8ece480 100644 --- a/caffe2/python/models/seq2seq/beam_search.py +++ b/caffe2/python/models/seq2seq/beam_search.py @@ -1,9 +1,9 @@ ## @package beam_search # Module caffe2.python.models.seq2seq.beam_search -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from collections import namedtuple from caffe2.python import core diff --git a/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py b/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py index 0ee1f6e35ba0..c10d2f1ab4ed 100644 --- a/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py +++ b/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import os diff --git a/caffe2/python/models/seq2seq/seq2seq_model_helper.py b/caffe2/python/models/seq2seq/seq2seq_model_helper.py index b2a50c4bd58b..5adabb86fadf 100644 --- a/caffe2/python/models/seq2seq/seq2seq_model_helper.py +++ b/caffe2/python/models/seq2seq/seq2seq_model_helper.py @@ -1,9 +1,9 @@ ## @package seq2seq_model_helper # Module caffe2.python.models.seq2seq.seq2seq_model_helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope from caffe2.python.model_helper 
import ModelHelper diff --git a/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py b/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py index 8095440f2e5a..b70b74d39dc9 100644 --- a/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py +++ b/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.models.seq2seq import seq2seq_model_helper from caffe2.python import scope, test_util diff --git a/caffe2/python/models/seq2seq/seq2seq_util.py b/caffe2/python/models/seq2seq/seq2seq_util.py index d0702880c1ec..e1b4224ea4c8 100644 --- a/caffe2/python/models/seq2seq/seq2seq_util.py +++ b/caffe2/python/models/seq2seq/seq2seq_util.py @@ -2,10 +2,10 @@ # Module caffe2.python.examples.seq2seq_util """ A bunch of util functions to build Seq2Seq models with Caffe2.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import collections from future.utils import viewitems diff --git a/caffe2/python/models/seq2seq/train.py b/caffe2/python/models/seq2seq/train.py index df68e3e30d7b..8080318da4d0 100644 --- a/caffe2/python/models/seq2seq/train.py +++ b/caffe2/python/models/seq2seq/train.py @@ -1,9 +1,9 @@ ## @package train # Module caffe2.python.models.seq2seq.train -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import collections diff --git a/caffe2/python/models/seq2seq/translate.py b/caffe2/python/models/seq2seq/translate.py index d2b6a4f6399f..7e77f623e553 100644 --- a/caffe2/python/models/seq2seq/translate.py +++ b/caffe2/python/models/seq2seq/translate.py @@ -1,9 +1,9 @@ ## @package translate # Module caffe2.python.models.seq2seq.translate -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from abc import ABCMeta, abstractmethod import argparse diff --git a/caffe2/python/models/shufflenet.py b/caffe2/python/models/shufflenet.py index c9075a4a1295..33a7f7a4b7c5 100644 --- a/caffe2/python/models/shufflenet.py +++ b/caffe2/python/models/shufflenet.py @@ -1,9 +1,9 @@ # Module caffe2.python.models.shufflenet -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew diff --git a/caffe2/python/models/shufflenet_test.py b/caffe2/python/models/shufflenet_test.py index 344c720b3eb6..6ccfd0a83354 100644 --- a/caffe2/python/models/shufflenet_test.py +++ b/caffe2/python/models/shufflenet_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/modifier_context.py b/caffe2/python/modifier_context.py index 008e651e41f7..b65d97587549 100644 --- a/caffe2/python/modifier_context.py +++ b/caffe2/python/modifier_context.py @@ -1,9 +1,9 @@ # @package modifier_context # Module caffe2.python.modifier_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + 
DEFAULT_MODIFIER = 'DEFAULT' diff --git a/caffe2/python/net_builder.py b/caffe2/python/net_builder.py index f1af8c3eb521..70dcdec11a58 100644 --- a/caffe2/python/net_builder.py +++ b/caffe2/python/net_builder.py @@ -1,9 +1,9 @@ ## @package net_builder # Module caffe2.python.net_builder -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, context from caffe2.python.task import Task, TaskGroup diff --git a/caffe2/python/net_builder_test.py b/caffe2/python/net_builder_test.py index 169419c5c17b..bef6caefac3d 100644 --- a/caffe2/python/net_builder_test.py +++ b/caffe2/python/net_builder_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace from caffe2.python.core import Plan, to_execution_step, Net diff --git a/caffe2/python/net_drawer.py b/caffe2/python/net_drawer.py index 1fd0833a718f..b55699c1c095 100644 --- a/caffe2/python/net_drawer.py +++ b/caffe2/python/net_drawer.py @@ -1,9 +1,9 @@ ## @package net_drawer # Module caffe2.python.net_drawer -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import json import logging diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py index 09cde6f76767..8e1d65c01ce7 100644 --- a/caffe2/python/net_printer.py +++ b/caffe2/python/net_printer.py @@ -1,9 +1,9 @@ ## @package net_printer # Module caffe2.python.net_printer -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto.caffe2_pb2 import OperatorDef, NetDef from caffe2.python.checkpoint import Job diff --git a/caffe2/python/net_printer_test.py b/caffe2/python/net_printer_test.py index bc086c3eee2a..e71a2b323dea 100644 --- a/caffe2/python/net_printer_test.py +++ b/caffe2/python/net_printer_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import net_printer from caffe2.python.checkpoint import Job diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py index c2f1774c7b2b..2b83e0ec9358 100644 --- a/caffe2/python/nomnigraph.py +++ b/caffe2/python/nomnigraph.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import errno import os diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 6ff47c6d4c9a..3d9adc696486 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, test_util from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/nomnigraph_transformations.py b/caffe2/python/nomnigraph_transformations.py index f4bc2c68bbb6..570c743df152 100644 --- a/caffe2/python/nomnigraph_transformations.py +++ b/caffe2/python/nomnigraph_transformations.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals 
+ from collections import defaultdict diff --git a/caffe2/python/nomnigraph_transformations_test.py b/caffe2/python/nomnigraph_transformations_test.py index 6c58691db277..adbfe1a4885a 100644 --- a/caffe2/python/nomnigraph_transformations_test.py +++ b/caffe2/python/nomnigraph_transformations_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python import test_util as tu diff --git a/caffe2/python/normalizer.py b/caffe2/python/normalizer.py index 1d452c6cbe60..2ca147328c78 100644 --- a/caffe2/python/normalizer.py +++ b/caffe2/python/normalizer.py @@ -1,6 +1,6 @@ # @package optimizer # Module caffe2.python.normalizer -from __future__ import absolute_import, division, print_function, unicode_literals + class Normalizer(object): diff --git a/caffe2/python/normalizer_context.py b/caffe2/python/normalizer_context.py index 57c1052103dc..a85b993b4502 100644 --- a/caffe2/python/normalizer_context.py +++ b/caffe2/python/normalizer_context.py @@ -1,9 +1,9 @@ # @package regularizer_context # Module caffe2.python.normalizer_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context from caffe2.python.modifier_context import ( diff --git a/caffe2/python/normalizer_test.py b/caffe2/python/normalizer_test.py index 1f4cb4896778..f0ce5099ea75 100644 --- a/caffe2/python/normalizer_test.py +++ b/caffe2/python/normalizer_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python.normalizer_context import UseNormalizer, NormalizerContext from caffe2.python.normalizer import BatchNormalizer diff --git a/caffe2/python/numa_benchmark.py b/caffe2/python/numa_benchmark.py index 21c1cb158da1..a840c6932123 100644 --- a/caffe2/python/numa_benchmark.py +++ b/caffe2/python/numa_benchmark.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/numa_test.py b/caffe2/python/numa_test.py index 692f515abe87..aba6e420ed55 100644 --- a/caffe2/python/numa_test.py +++ b/caffe2/python/numa_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/observer_test.py b/caffe2/python/observer_test.py index 684514d17268..cc3ca1718a5c 100644 --- a/caffe2/python/observer_test.py +++ b/caffe2/python/observer_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 9fe7b23bb7ae..d0f768e42eeb 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -5,10 +5,10 @@ To run this, you will need to have Caffe2 installed as well. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import collections diff --git a/caffe2/python/onnx/backend_cpp_rep.py b/caffe2/python/onnx/backend_cpp_rep.py index 27135b35763d..4a75068cfd03 100644 --- a/caffe2/python/onnx/backend_cpp_rep.py +++ b/caffe2/python/onnx/backend_cpp_rep.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.backend_rep_cpp -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from onnx.backend.base import BackendRep, namedtupledict diff --git a/caffe2/python/onnx/backend_rep.py b/caffe2/python/onnx/backend_rep.py index 13feea3ac8c9..ab97fd562dc1 100644 --- a/caffe2/python/onnx/backend_rep.py +++ b/caffe2/python/onnx/backend_rep.py @@ -1,9 +1,9 @@ # @package onnx # Module caffe2.python.onnx.backend_rep -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/onnx/bin/conversion.py b/caffe2/python/onnx/bin/conversion.py index a30ebdfc3f54..126eef8a8470 100644 --- a/caffe2/python/onnx/bin/conversion.py +++ b/caffe2/python/onnx/bin/conversion.py @@ -1,9 +1,9 @@ ## @package onnx # Module caffe2.python.onnx.bin.conversion -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import json diff --git a/caffe2/python/onnx/error.py b/caffe2/python/onnx/error.py index da72af2cc9b1..1bac8290464d 100644 --- a/caffe2/python/onnx/error.py +++ b/caffe2/python/onnx/error.py @@ -1,8 +1,8 @@ ## @package onnx # Module caffe2.python.onnx.error -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + class BaseException(Exception): pass class Unsupported(BaseException): pass diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index 0fc1c0328093..ee3c30949ff7 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -6,10 +6,10 @@ To run this, you will need to have Caffe2 installed as well. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import itertools import logging diff --git a/caffe2/python/onnx/helper.py b/caffe2/python/onnx/helper.py index e1d56e1a6766..7f8f1a6d346a 100644 --- a/caffe2/python/onnx/helper.py +++ b/caffe2/python/onnx/helper.py @@ -1,9 +1,9 @@ ## @package onnx # Module caffe2.python.onnx.helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from onnx.backend.base import namedtupledict diff --git a/caffe2/python/onnx/onnxifi.py b/caffe2/python/onnx/onnxifi.py index 6bbd35cd434c..a04e7e4554b9 100644 --- a/caffe2/python/onnx/onnxifi.py +++ b/caffe2/python/onnx/onnxifi.py @@ -5,10 +5,10 @@ ONNXIFI a Caffe2 net """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/onnx/test_onnxifi.py b/caffe2/python/onnx/test_onnxifi.py index a859b572bae6..7eafccaec9e4 100644 --- a/caffe2/python/onnx/test_onnxifi.py +++ b/caffe2/python/onnx/test_onnxifi.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import json import numpy as np diff --git a/caffe2/python/onnx/tests/__init__.py b/caffe2/python/onnx/tests/__init__.py index e0a02b9d5d83..fd40910d9e70 100644 --- a/caffe2/python/onnx/tests/__init__.py +++ b/caffe2/python/onnx/tests/__init__.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index d909cf828042..d253b06658a3 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -1,10 +1,10 @@ # @package onnx # Module caffe2.python.onnx.tests.c2_ref_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import json import os diff --git a/caffe2/python/onnx/tests/conversion_test.py b/caffe2/python/onnx/tests/conversion_test.py index 8fa128acd62b..86cdddcd1692 100644 --- a/caffe2/python/onnx/tests/conversion_test.py +++ b/caffe2/python/onnx/tests/conversion_test.py @@ -1,9 +1,9 @@ ## @package onnx # Module caffe2.python.onnx.tests.conversion_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import json import six diff --git a/caffe2/python/onnx/tests/helper_test.py b/caffe2/python/onnx/tests/helper_test.py index e3682780cb04..9000ad94fd9b 100644 --- a/caffe2/python/onnx/tests/helper_test.py +++ b/caffe2/python/onnx/tests/helper_test.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.tests.helper_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index ad7885fcda74..e4de0a19c07a 100644 
--- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -1,10 +1,10 @@ # @package onnx # Module caffe2.python.onnx.tests.onnx_backend_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os diff --git a/caffe2/python/onnx/tests/ssa_test.py b/caffe2/python/onnx/tests/ssa_test.py index 34f849400e30..d34d4a0e5287 100644 --- a/caffe2/python/onnx/tests/ssa_test.py +++ b/caffe2/python/onnx/tests/ssa_test.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.tests.ssa_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import copy import onnx diff --git a/caffe2/python/onnx/tests/test_utils.py b/caffe2/python/onnx/tests/test_utils.py index 1fec938c8e88..d224daf05ba3 100644 --- a/caffe2/python/onnx/tests/test_utils.py +++ b/caffe2/python/onnx/tests/test_utils.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.tests.test_utils -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import unittest diff --git a/caffe2/python/onnx/workspace.py b/caffe2/python/onnx/workspace.py index a311ec37dfdc..f03e3609fe8b 100644 --- a/caffe2/python/onnx/workspace.py +++ b/caffe2/python/onnx/workspace.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.workspace -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import uuid diff --git a/caffe2/python/operator_fp_exceptions_test.py b/caffe2/python/operator_fp_exceptions_test.py index 6e08f920a422..3a1ebcd4ec67 100644 --- a/caffe2/python/operator_fp_exceptions_test.py +++ b/caffe2/python/operator_fp_exceptions_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py index 6a7a5ca18ef3..132bee879f6d 100644 --- a/caffe2/python/operator_test/activation_ops_test.py +++ b/caffe2/python/operator_test/activation_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/adadelta_test.py b/caffe2/python/operator_test/adadelta_test.py index 4cb9a54ec664..265d783e6336 100644 --- a/caffe2/python/operator_test/adadelta_test.py +++ b/caffe2/python/operator_test/adadelta_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/operator_test/adagrad_test.py b/caffe2/python/operator_test/adagrad_test.py index 5ed2d0287e63..55e2f570cf07 100644 --- a/caffe2/python/operator_test/adagrad_test.py +++ b/caffe2/python/operator_test/adagrad_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import functools diff --git 
a/caffe2/python/operator_test/adagrad_test_helper.py b/caffe2/python/operator_test/adagrad_test_helper.py index 891361e3a879..0fe4aa21f5f9 100644 --- a/caffe2/python/operator_test/adagrad_test_helper.py +++ b/caffe2/python/operator_test/adagrad_test_helper.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + from functools import partial diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 0d188abc52be..2fb13c149922 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/operator_test/affine_channel_op_test.py b/caffe2/python/operator_test/affine_channel_op_test.py index 7e37216b82c1..76b09fdd5cd6 100644 --- a/caffe2/python/operator_test/affine_channel_op_test.py +++ b/caffe2/python/operator_test/affine_channel_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/apmeter_test.py b/caffe2/python/operator_test/apmeter_test.py index b7a50ab98e87..1ca26bf64f31 100644 --- a/caffe2/python/operator_test/apmeter_test.py +++ b/caffe2/python/operator_test/apmeter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/arg_ops_test.py b/caffe2/python/operator_test/arg_ops_test.py index ce800636e6e6..330d17ed6999 100644 --- a/caffe2/python/operator_test/arg_ops_test.py +++ b/caffe2/python/operator_test/arg_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/assert_test.py b/caffe2/python/operator_test/assert_test.py index e3474c0da7a4..2bbca5ab7376 100644 --- a/caffe2/python/operator_test/assert_test.py +++ b/caffe2/python/operator_test/assert_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, settings diff --git a/caffe2/python/operator_test/atomic_ops_test.py b/caffe2/python/operator_test/atomic_ops_test.py index 753e76f15319..88e38df52da5 100644 --- a/caffe2/python/operator_test/atomic_ops_test.py +++ b/caffe2/python/operator_test/atomic_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/basic_rnn_test.py b/caffe2/python/operator_test/basic_rnn_test.py index 516c066c6ed8..e863289d488c 100644 --- a/caffe2/python/operator_test/basic_rnn_test.py +++ b/caffe2/python/operator_test/basic_rnn_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import 
division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core, rnn_cell from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/operator_test/batch_box_cox_test.py b/caffe2/python/operator_test/batch_box_cox_test.py index 19186220159c..c9306ce1ab07 100644 --- a/caffe2/python/operator_test/batch_box_cox_test.py +++ b/caffe2/python/operator_test/batch_box_cox_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/batch_bucketize_op_test.py b/caffe2/python/operator_test/batch_bucketize_op_test.py index fb13b0c08933..82def0572686 100644 --- a/caffe2/python/operator_test/batch_bucketize_op_test.py +++ b/caffe2/python/operator_test/batch_bucketize_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/batch_moments_op_test.py b/caffe2/python/operator_test/batch_moments_op_test.py index c3ee8750225b..12dd72a4160a 100644 --- a/caffe2/python/operator_test/batch_moments_op_test.py +++ b/caffe2/python/operator_test/batch_moments_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py index ef59ed23888f..adfc735c66fd 100644 --- a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py +++ b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/bbox_transform_test.py b/caffe2/python/operator_test/bbox_transform_test.py index f1ee07c0d1e3..d2584f18af40 100644 --- a/caffe2/python/operator_test/bbox_transform_test.py +++ b/caffe2/python/operator_test/bbox_transform_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/bisect_percentile_op_test.py b/caffe2/python/operator_test/bisect_percentile_op_test.py index 77faeaeeb608..147a41282505 100644 --- a/caffe2/python/operator_test/bisect_percentile_op_test.py +++ b/caffe2/python/operator_test/bisect_percentile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/blobs_queue_db_test.py b/caffe2/python/operator_test/blobs_queue_db_test.py index 6e4c25c77c78..6cf8170b34f8 100644 --- a/caffe2/python/operator_test/blobs_queue_db_test.py +++ b/caffe2/python/operator_test/blobs_queue_db_test.py @@ 
-1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 9ccaeaf9e7a7..05b8212242e4 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core diff --git a/caffe2/python/operator_test/boolean_unmask_test.py b/caffe2/python/operator_test/boolean_unmask_test.py index e3bc9f248d3a..8cba2aecf1a4 100644 --- a/caffe2/python/operator_test/boolean_unmask_test.py +++ b/caffe2/python/operator_test/boolean_unmask_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/box_with_nms_limit_op_test.py b/caffe2/python/operator_test/box_with_nms_limit_op_test.py index bfbe9b7396fa..3131316feefd 100644 --- a/caffe2/python/operator_test/box_with_nms_limit_op_test.py +++ b/caffe2/python/operator_test/box_with_nms_limit_op_test.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/bucketize_op_test.py b/caffe2/python/operator_test/bucketize_op_test.py index d1cd6ada7f55..bf9af112a5b0 100644 --- a/caffe2/python/operator_test/bucketize_op_test.py +++ b/caffe2/python/operator_test/bucketize_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, dyndep from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/cast_op_test.py b/caffe2/python/operator_test/cast_op_test.py index f7ffb5b45b47..bf2a210086e6 100644 --- a/caffe2/python/operator_test/cast_op_test.py +++ b/caffe2/python/operator_test/cast_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py index 4e30c915ce2a..e8ee47702445 100644 --- a/caffe2/python/operator_test/ceil_op_test.py +++ b/caffe2/python/operator_test/ceil_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/channel_backprop_stats_op_test.py b/caffe2/python/operator_test/channel_backprop_stats_op_test.py index 7d614047f48d..7adc5ce24fb7 100644 --- a/caffe2/python/operator_test/channel_backprop_stats_op_test.py +++ 
b/caffe2/python/operator_test/channel_backprop_stats_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/channel_shuffle_test.py b/caffe2/python/operator_test/channel_shuffle_test.py index d420484bac6b..b821e7b6a43c 100644 --- a/caffe2/python/operator_test/channel_shuffle_test.py +++ b/caffe2/python/operator_test/channel_shuffle_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/channel_stats_op_test.py b/caffe2/python/operator_test/channel_stats_op_test.py index cbef433ae0d3..72eedc479dd6 100644 --- a/caffe2/python/operator_test/channel_stats_op_test.py +++ b/caffe2/python/operator_test/channel_stats_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/checkpoint_test.py b/caffe2/python/operator_test/checkpoint_test.py index 7449ab61f32d..3042e5989764 100644 --- a/caffe2/python/operator_test/checkpoint_test.py +++ b/caffe2/python/operator_test/checkpoint_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, test_util import os diff --git a/caffe2/python/operator_test/clip_op_test.py b/caffe2/python/operator_test/clip_op_test.py index c2d9809c8d80..3304121aab08 100644 --- a/caffe2/python/operator_test/clip_op_test.py +++ b/caffe2/python/operator_test/clip_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/clip_tensor_op_test.py b/caffe2/python/operator_test/clip_tensor_op_test.py index ee5bd8f73eb3..efc86815bc49 100644 --- a/caffe2/python/operator_test/clip_tensor_op_test.py +++ b/caffe2/python/operator_test/clip_tensor_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py index b5d726d449fc..28e6cd3b3df6 100644 --- a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py +++ b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py index bbe0e8eda1c1..1927b4eac78f 100644 --- 
a/caffe2/python/operator_test/concat_split_op_test.py +++ b/caffe2/python/operator_test/concat_split_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/conditional_test.py b/caffe2/python/operator_test/conditional_test.py index 88d8fd8b7a27..2e214f089a45 100644 --- a/caffe2/python/operator_test/conditional_test.py +++ b/caffe2/python/operator_test/conditional_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/conftest.py b/caffe2/python/operator_test/conftest.py index ccd78eea4aa3..a240e98fc51e 100644 --- a/caffe2/python/operator_test/conftest.py +++ b/caffe2/python/operator_test/conftest.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 3e24e05191ac..ae54cd37a91d 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import collections import functools diff --git a/caffe2/python/operator_test/conv_transpose_test.py b/caffe2/python/operator_test/conv_transpose_test.py index 6bed93226f5b..4fcb6361d0a6 100644 --- a/caffe2/python/operator_test/conv_transpose_test.py +++ b/caffe2/python/operator_test/conv_transpose_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/copy_ops_test.py b/caffe2/python/operator_test/copy_ops_test.py index 4efec570e812..2b8b756cdf61 100644 --- a/caffe2/python/operator_test/copy_ops_test.py +++ b/caffe2/python/operator_test/copy_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py b/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py index 9024ee3edfd1..8e914259bb78 100644 --- a/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py +++ b/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import logging diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py index 1124df94e67a..04bfbbe6f4f6 100644 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import 
hypothesis.strategies as st diff --git a/caffe2/python/operator_test/counter_ops_test.py b/caffe2/python/operator_test/counter_ops_test.py index 3ebe26415622..d57ff31508c6 100644 --- a/caffe2/python/operator_test/counter_ops_test.py +++ b/caffe2/python/operator_test/counter_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/crf_test.py b/caffe2/python/operator_test/crf_test.py index d9eb89fc3352..b75e7b7b1a10 100644 --- a/caffe2/python/operator_test/crf_test.py +++ b/caffe2/python/operator_test/crf_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, crf, brew from caffe2.python.model_helper import ModelHelper import numpy as np diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py index 25dc6791fa12..d1852e7dd9e8 100644 --- a/caffe2/python/operator_test/cross_entropy_ops_test.py +++ b/caffe2/python/operator_test/cross_entropy_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py index 21ca68fe007a..1dda7166e65a 100644 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.test_util import caffe2_flaky diff --git a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py index 0fd38a82b403..8bc7eb47d488 100644 --- a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/cudnn_recurrent_test.py b/caffe2/python/operator_test/cudnn_recurrent_test.py index 5de901026eb6..db1b826cfe41 100644 --- a/caffe2/python/operator_test/cudnn_recurrent_test.py +++ b/caffe2/python/operator_test/cudnn_recurrent_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import model_helper, workspace, core, rnn_cell from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/operator_test/data_couple_op_test.py b/caffe2/python/operator_test/data_couple_op_test.py index 32cf21e81bbf..d840207159b2 100644 --- a/caffe2/python/operator_test/data_couple_op_test.py +++ 
b/caffe2/python/operator_test/data_couple_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/dataset_ops_test.py b/caffe2/python/operator_test/dataset_ops_test.py index 138ac90e68c8..96d93dc5effb 100644 --- a/caffe2/python/operator_test/dataset_ops_test.py +++ b/caffe2/python/operator_test/dataset_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace, dataset from caffe2.python.dataset import Const diff --git a/caffe2/python/operator_test/deform_conv_test.py b/caffe2/python/operator_test/deform_conv_test.py index 31e407499063..f6ad0e38e73c 100644 --- a/caffe2/python/operator_test/deform_conv_test.py +++ b/caffe2/python/operator_test/deform_conv_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import os import unittest diff --git a/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py b/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py index aea30d890416..8b6f42417fd4 100644 --- a/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py +++ b/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/depthwise_3x3_conv_test.py b/caffe2/python/operator_test/depthwise_3x3_conv_test.py index af431f1f07d4..2d6d6429f833 100644 --- a/caffe2/python/operator_test/depthwise_3x3_conv_test.py +++ b/caffe2/python/operator_test/depthwise_3x3_conv_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/detectron_keypoints.py b/caffe2/python/operator_test/detectron_keypoints.py index 2f34349beae4..1abff0675993 100644 --- a/caffe2/python/operator_test/detectron_keypoints.py +++ b/caffe2/python/operator_test/detectron_keypoints.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + try: import cv2 diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py index 753b94d20f1f..e948fdae9673 100644 --- a/caffe2/python/operator_test/distance_op_test.py +++ b/caffe2/python/operator_test/distance_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py index c8c46127e4d9..84c2f7e35f56 100644 --- a/caffe2/python/operator_test/dropout_op_test.py +++ b/caffe2/python/operator_test/dropout_op_test.py 
@@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import assume, given, settings import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/duplicate_operands_test.py b/caffe2/python/operator_test/duplicate_operands_test.py index 385e69fded4c..179b42dbabc8 100644 --- a/caffe2/python/operator_test/duplicate_operands_test.py +++ b/caffe2/python/operator_test/duplicate_operands_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py index 8c7df5f33625..ac0dc3dd0975 100644 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ b/caffe2/python/operator_test/elementwise_linear_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/elementwise_logical_ops_test.py b/caffe2/python/operator_test/elementwise_logical_ops_test.py index e35b4a483c6d..3195d969dee5 100644 --- a/caffe2/python/operator_test/elementwise_logical_ops_test.py +++ b/caffe2/python/operator_test/elementwise_logical_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index ef9c1b9c8cf3..605c1d741271 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index ca2b847f088c..ed7a09eb0857 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, assume, settings diff --git a/caffe2/python/operator_test/emptysample_ops_test.py b/caffe2/python/operator_test/emptysample_ops_test.py index a04e9d0e161d..0f728b723163 100644 --- a/caffe2/python/operator_test/emptysample_ops_test.py +++ b/caffe2/python/operator_test/emptysample_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/enforce_finite_op_test.py b/caffe2/python/operator_test/enforce_finite_op_test.py index 
c8c12e240946..b843bfdc95b9 100644 --- a/caffe2/python/operator_test/enforce_finite_op_test.py +++ b/caffe2/python/operator_test/enforce_finite_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import numpy as np diff --git a/caffe2/python/operator_test/ensure_clipped_test.py b/caffe2/python/operator_test/ensure_clipped_test.py index 8d3c638e1ba1..a89718745b1c 100644 --- a/caffe2/python/operator_test/ensure_clipped_test.py +++ b/caffe2/python/operator_test/ensure_clipped_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/ensure_cpu_output_op_test.py b/caffe2/python/operator_test/ensure_cpu_output_op_test.py index 509c28a5a8bb..4812ee3042e0 100644 --- a/caffe2/python/operator_test/ensure_cpu_output_op_test.py +++ b/caffe2/python/operator_test/ensure_cpu_output_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import numpy as np diff --git a/caffe2/python/operator_test/erf_op_test.py b/caffe2/python/operator_test/erf_op_test.py index 5761c8409bd3..64714db4315c 100644 --- a/caffe2/python/operator_test/erf_op_test.py +++ b/caffe2/python/operator_test/erf_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import math diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py index 4be96208fbba..0d198b1aff14 100644 --- a/caffe2/python/operator_test/expand_op_test.py +++ b/caffe2/python/operator_test/expand_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/fc_operator_test.py b/caffe2/python/operator_test/fc_operator_test.py index c08596f8717d..1e8b5522053d 100644 --- a/caffe2/python/operator_test/fc_operator_test.py +++ b/caffe2/python/operator_test/fc_operator_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core diff --git a/caffe2/python/operator_test/feature_maps_ops_test.py b/caffe2/python/operator_test/feature_maps_ops_test.py index 1d64b19b993f..19fa329c9389 100644 --- a/caffe2/python/operator_test/feature_maps_ops_test.py +++ b/caffe2/python/operator_test/feature_maps_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, dyndep from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/filler_ops_test.py b/caffe2/python/operator_test/filler_ops_test.py index 4a2d9419d7bc..e080dde3eb5f 100644 --- 
a/caffe2/python/operator_test/filler_ops_test.py +++ b/caffe2/python/operator_test/filler_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/find_op_test.py b/caffe2/python/operator_test/find_op_test.py index c6d2856c3514..fc25913d8744 100644 --- a/caffe2/python/operator_test/find_op_test.py +++ b/caffe2/python/operator_test/find_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/flatten_op_test.py b/caffe2/python/operator_test/flatten_op_test.py index 19d204e0bded..2e0340c68779 100644 --- a/caffe2/python/operator_test/flatten_op_test.py +++ b/caffe2/python/operator_test/flatten_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import numpy as np diff --git a/caffe2/python/operator_test/flexible_top_k_test.py b/caffe2/python/operator_test/flexible_top_k_test.py index 9542ecd30691..3e0e5722b0ce 100644 --- a/caffe2/python/operator_test/flexible_top_k_test.py +++ b/caffe2/python/operator_test/flexible_top_k_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/floor_op_test.py b/caffe2/python/operator_test/floor_op_test.py index 5877cb6da4e8..8c0974bb8579 100644 --- a/caffe2/python/operator_test/floor_op_test.py +++ b/caffe2/python/operator_test/floor_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py b/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py index ecabe7d29ef0..12d0b0265afb 100644 --- a/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py +++ b/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import math import struct diff --git a/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py b/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py index 09225385191a..e9af40a128a6 100644 --- a/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py +++ b/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py index 967131de38d8..fc23be13fdae 100644 --- a/caffe2/python/operator_test/gather_ops_test.py +++ b/caffe2/python/operator_test/gather_ops_test.py @@ -1,7 +1,7 @@ -from 
__future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/gather_ranges_op_test.py b/caffe2/python/operator_test/gather_ranges_op_test.py index 19d538c60556..c0d73af33601 100644 --- a/caffe2/python/operator_test/gather_ranges_op_test.py +++ b/caffe2/python/operator_test/gather_ranges_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py b/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py index 3b1b4bf86515..7dea8f308783 100644 --- a/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py +++ b/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/given_tensor_fill_op_test.py b/caffe2/python/operator_test/given_tensor_fill_op_test.py index bcd277cf258b..3d929ce5c0ee 100644 --- a/caffe2/python/operator_test/given_tensor_fill_op_test.py +++ b/caffe2/python/operator_test/given_tensor_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py index f70c0739ded8..f38df09ec9fb 100644 --- a/caffe2/python/operator_test/glu_op_test.py +++ b/caffe2/python/operator_test/glu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py index 1d46888e791a..62aba236d5ba 100644 --- a/caffe2/python/operator_test/group_conv_test.py +++ b/caffe2/python/operator_test/group_conv_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/group_norm_op_test.py b/caffe2/python/operator_test/group_norm_op_test.py index d17998c32986..14300beed3f9 100644 --- a/caffe2/python/operator_test/group_norm_op_test.py +++ b/caffe2/python/operator_test/group_norm_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index 99da7a3f5626..99444f39ac26 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -1,7 +1,7 @@ -from __future__ 
import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core, scope, gru_cell from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py index ae8c1dc22799..e683a04d7998 100644 --- a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py +++ b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import torch diff --git a/caffe2/python/operator_test/hsm_test.py b/caffe2/python/operator_test/hsm_test.py index f2321adc8e01..245bca210ad9 100644 --- a/caffe2/python/operator_test/hsm_test.py +++ b/caffe2/python/operator_test/hsm_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import numpy as np import unittest diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py index 9fdf0cabb0bd..90a8197e7ccf 100644 --- a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ b/caffe2/python/operator_test/hyperbolic_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/im2col_col2im_test.py b/caffe2/python/operator_test/im2col_col2im_test.py index 98e9d61b5bd0..760228382bc6 100644 --- a/caffe2/python/operator_test/im2col_col2im_test.py +++ b/caffe2/python/operator_test/im2col_col2im_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/image_input_op_test.py b/caffe2/python/operator_test/image_input_op_test.py index 79acc60739f1..0de1f0ad048b 100644 --- a/caffe2/python/operator_test/image_input_op_test.py +++ b/caffe2/python/operator_test/image_input_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest try: diff --git a/caffe2/python/operator_test/index_hash_ops_test.py b/caffe2/python/operator_test/index_hash_ops_test.py index f7c6d0cdc14a..1eb7ffa20691 100644 --- a/caffe2/python/operator_test/index_hash_ops_test.py +++ b/caffe2/python/operator_test/index_hash_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/index_ops_test.py b/caffe2/python/operator_test/index_ops_test.py index 642f340fad80..cf021f59362b 100644 --- a/caffe2/python/operator_test/index_ops_test.py +++ b/caffe2/python/operator_test/index_ops_test.py @@ -1,7 +1,7 @@ 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index e57b8a8e11d8..fb4f3c935ba8 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, assume, settings diff --git a/caffe2/python/operator_test/integral_image_ops_test.py b/caffe2/python/operator_test/integral_image_ops_test.py index 212f807addcf..79d79ae6de21 100644 --- a/caffe2/python/operator_test/integral_image_ops_test.py +++ b/caffe2/python/operator_test/integral_image_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py index 51faa14b9029..6ed2db2e88c2 100644 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ b/caffe2/python/operator_test/jsd_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/key_split_ops_test.py b/caffe2/python/operator_test/key_split_ops_test.py index be38ee38926f..18fddff58d17 100644 --- a/caffe2/python/operator_test/key_split_ops_test.py +++ b/caffe2/python/operator_test/key_split_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/lars_test.py b/caffe2/python/operator_test/lars_test.py index e2f02b29d26f..6f976520e06b 100644 --- a/caffe2/python/operator_test/lars_test.py +++ b/caffe2/python/operator_test/lars_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index 89ba4b2017bd..56cd72d69991 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, core, workspace from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/operator_test/leaky_relu_test.py b/caffe2/python/operator_test/leaky_relu_test.py index 2eaa782eeefd..9a888cac7901 100644 --- a/caffe2/python/operator_test/leaky_relu_test.py +++ b/caffe2/python/operator_test/leaky_relu_test.py @@ -1,6 +1,6 @@ -from __future__ import 
absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, assume diff --git a/caffe2/python/operator_test/learning_rate_adaption_op_test.py b/caffe2/python/operator_test/learning_rate_adaption_op_test.py index 3a5d44663771..1891171b80d8 100644 --- a/caffe2/python/operator_test/learning_rate_adaption_op_test.py +++ b/caffe2/python/operator_test/learning_rate_adaption_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/learning_rate_op_test.py b/caffe2/python/operator_test/learning_rate_op_test.py index 1a1f9eb8c842..bdce6a4c78f7 100644 --- a/caffe2/python/operator_test/learning_rate_op_test.py +++ b/caffe2/python/operator_test/learning_rate_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/length_split_op_test.py b/caffe2/python/operator_test/length_split_op_test.py index fa3ac0826230..28d7134ac5e8 100644 --- a/caffe2/python/operator_test/length_split_op_test.py +++ b/caffe2/python/operator_test/length_split_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py index d9cd2b244604..626ec0542b7d 100644 --- a/caffe2/python/operator_test/lengths_pad_op_test.py +++ b/caffe2/python/operator_test/lengths_pad_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py index 88c99c3da337..fc4e89e2545b 100644 --- a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py +++ b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index 4a9a6b0ff1a9..e0a5f9609588 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/lengths_top_k_ops_test.py b/caffe2/python/operator_test/lengths_top_k_ops_test.py index 8bc27c31144f..b8b082a02125 100644 --- 
a/caffe2/python/operator_test/lengths_top_k_ops_test.py +++ b/caffe2/python/operator_test/lengths_top_k_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/listwise_l2r_operator_test.py b/caffe2/python/operator_test/listwise_l2r_operator_test.py index 8f4f680de109..c08f1180a920 100644 --- a/caffe2/python/operator_test/listwise_l2r_operator_test.py +++ b/caffe2/python/operator_test/listwise_l2r_operator_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py index a5e28479cf10..845bafee4702 100644 --- a/caffe2/python/operator_test/load_save_test.py +++ b/caffe2/python/operator_test/load_save_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import errno import hypothesis.strategies as st from hypothesis import given, assume, settings diff --git a/caffe2/python/operator_test/locally_connected_op_test.py b/caffe2/python/operator_test/locally_connected_op_test.py index cfd49b8a7eb8..6eb3181ea9ad 100644 --- a/caffe2/python/operator_test/locally_connected_op_test.py +++ b/caffe2/python/operator_test/locally_connected_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, settings, assume diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py index e57bdb7a1d41..24cb65ac96f8 100644 --- a/caffe2/python/operator_test/loss_ops_test.py +++ b/caffe2/python/operator_test/loss_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/lpnorm_op_test.py b/caffe2/python/operator_test/lpnorm_op_test.py index 1fcacc4f26f8..3a58cbe6d960 100644 --- a/caffe2/python/operator_test/lpnorm_op_test.py +++ b/caffe2/python/operator_test/lpnorm_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/map_ops_test.py b/caffe2/python/operator_test/map_ops_test.py index add86a3a467e..dcc8b295f7c3 100644 --- a/caffe2/python/operator_test/map_ops_test.py +++ b/caffe2/python/operator_test/map_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import itertools import numpy as np diff --git a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py index 354aed27aaf4..e28dd1ce28f8 100644 --- a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py +++ 
b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/math_ops_test.py b/caffe2/python/operator_test/math_ops_test.py index e18025ffb92d..4849b83648f8 100644 --- a/caffe2/python/operator_test/math_ops_test.py +++ b/caffe2/python/operator_test/math_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py index fababb13c54a..b8cef19b24df 100644 --- a/caffe2/python/operator_test/matmul_op_test.py +++ b/caffe2/python/operator_test/matmul_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import inspect diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py index 77c6b82625b1..5830089f8e9b 100644 --- a/caffe2/python/operator_test/mean_op_test.py +++ b/caffe2/python/operator_test/mean_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/merge_id_lists_op_test.py b/caffe2/python/operator_test/merge_id_lists_op_test.py index 9f3302c6e75a..36b765557505 100644 --- a/caffe2/python/operator_test/merge_id_lists_op_test.py +++ b/caffe2/python/operator_test/merge_id_lists_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/mkl_conv_op_test.py b/caffe2/python/operator_test/mkl_conv_op_test.py index b72848b9a422..595debf977fe 100644 --- a/caffe2/python/operator_test/mkl_conv_op_test.py +++ b/caffe2/python/operator_test/mkl_conv_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/mkl_packed_fc_op_test.py b/caffe2/python/operator_test/mkl_packed_fc_op_test.py index 59546d3891e9..2f889d693444 100644 --- a/caffe2/python/operator_test/mkl_packed_fc_op_test.py +++ b/caffe2/python/operator_test/mkl_packed_fc_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/mod_op_test.py b/caffe2/python/operator_test/mod_op_test.py index 92a318f3f10f..914bffd2067c 100644 --- a/caffe2/python/operator_test/mod_op_test.py +++ b/caffe2/python/operator_test/mod_op_test.py @@ -1,7 +1,7 @@ -from 
__future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py index ae9d9158f506..3b270df254ce 100644 --- a/caffe2/python/operator_test/moments_op_test.py +++ b/caffe2/python/operator_test/moments_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/momentum_sgd_test.py b/caffe2/python/operator_test/momentum_sgd_test.py index a37e27141bd0..58f16e87a21c 100644 --- a/caffe2/python/operator_test/momentum_sgd_test.py +++ b/caffe2/python/operator_test/momentum_sgd_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/mpi_test.py b/caffe2/python/operator_test/mpi_test.py index 0885289c7c1a..bb111a125fc0 100644 --- a/caffe2/python/operator_test/mpi_test.py +++ b/caffe2/python/operator_test/mpi_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/mul_gradient_benchmark.py b/caffe2/python/operator_test/mul_gradient_benchmark.py index 721676239409..2e11aefcb497 100644 --- a/caffe2/python/operator_test/mul_gradient_benchmark.py +++ b/caffe2/python/operator_test/mul_gradient_benchmark.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import numpy as np diff --git a/caffe2/python/operator_test/negate_gradient_op_test.py b/caffe2/python/operator_test/negate_gradient_op_test.py index 14ca954d363f..137be1eece34 100644 --- a/caffe2/python/operator_test/negate_gradient_op_test.py +++ b/caffe2/python/operator_test/negate_gradient_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/ngram_ops_test.py b/caffe2/python/operator_test/ngram_ops_test.py index 70aad5cab814..3f4e57fa230b 100644 --- a/caffe2/python/operator_test/ngram_ops_test.py +++ b/caffe2/python/operator_test/ngram_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/normalize_op_test.py b/caffe2/python/operator_test/normalize_op_test.py index 46f88a1de079..7a35e0bafa31 100644 --- a/caffe2/python/operator_test/normalize_op_test.py +++ b/caffe2/python/operator_test/normalize_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + 
import functools diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py index 398b0d4b93ab..a202581f808c 100644 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ b/caffe2/python/operator_test/numpy_tile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py index d0b7a08ee706..593d5b5aa58c 100644 --- a/caffe2/python/operator_test/one_hot_ops_test.py +++ b/caffe2/python/operator_test/one_hot_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py index 811e38e34af7..4cff53b87d6e 100644 --- a/caffe2/python/operator_test/onnx_while_test.py +++ b/caffe2/python/operator_test/onnx_while_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/order_switch_test.py b/caffe2/python/operator_test/order_switch_test.py index 3777fdd7695d..7b3f40a27c97 100644 --- a/caffe2/python/operator_test/order_switch_test.py +++ b/caffe2/python/operator_test/order_switch_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/pack_ops_test.py b/caffe2/python/operator_test/pack_ops_test.py index 84f3f46a6dc1..698fbb76df88 100644 --- a/caffe2/python/operator_test/pack_ops_test.py +++ b/caffe2/python/operator_test/pack_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py index 6bf2315ca0c5..9a76e6b847a5 100644 --- a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py index 43cd10c23188..6d4e6bbdcd08 100644 --- a/caffe2/python/operator_test/pad_test.py +++ b/caffe2/python/operator_test/pad_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/partition_ops_test.py b/caffe2/python/operator_test/partition_ops_test.py index a5a7db12b1ef..b600c302d83b 100644 --- 
a/caffe2/python/operator_test/partition_ops_test.py +++ b/caffe2/python/operator_test/partition_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace from caffe2.python.test_util import TestCase, rand_array diff --git a/caffe2/python/operator_test/percentile_op_test.py b/caffe2/python/operator_test/percentile_op_test.py index 54c42bf63917..d81b0a963185 100644 --- a/caffe2/python/operator_test/percentile_op_test.py +++ b/caffe2/python/operator_test/percentile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, dyndep import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/piecewise_linear_transform_test.py b/caffe2/python/operator_test/piecewise_linear_transform_test.py index 463380306ce4..d7c4e0df4416 100644 --- a/caffe2/python/operator_test/piecewise_linear_transform_test.py +++ b/caffe2/python/operator_test/piecewise_linear_transform_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/pooling_test.py b/caffe2/python/operator_test/pooling_test.py index 743cee5cef3c..7ef98249bd79 100644 --- a/caffe2/python/operator_test/pooling_test.py +++ b/caffe2/python/operator_test/pooling_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/prepend_dim_test.py b/caffe2/python/operator_test/prepend_dim_test.py index 6cf8e7a81b5e..d794ba2162b9 100644 --- a/caffe2/python/operator_test/prepend_dim_test.py +++ b/caffe2/python/operator_test/prepend_dim_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/python_op_test.py b/caffe2/python/operator_test/python_op_test.py index 7467c8c3900c..b071070151d1 100644 --- a/caffe2/python/operator_test/python_op_test.py +++ b/caffe2/python/operator_test/python_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.core import CreatePythonOperator import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/quantile_test.py b/caffe2/python/operator_test/quantile_test.py index 6a4250d06183..39f3728d8e81 100644 --- a/caffe2/python/operator_test/quantile_test.py +++ b/caffe2/python/operator_test/quantile_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import unittest diff --git a/caffe2/python/operator_test/rand_quantization_op_speed_test.py b/caffe2/python/operator_test/rand_quantization_op_speed_test.py index ce0e84028541..1c56faff645f 
100644 --- a/caffe2/python/operator_test/rand_quantization_op_speed_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_speed_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import time diff --git a/caffe2/python/operator_test/rand_quantization_op_test.py b/caffe2/python/operator_test/rand_quantization_op_test.py index 811a20505a3c..e244f77149e1 100644 --- a/caffe2/python/operator_test/rand_quantization_op_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import struct diff --git a/caffe2/python/operator_test/rank_loss_operator_test.py b/caffe2/python/operator_test/rank_loss_operator_test.py index 94220d76762d..2d52da293127 100644 --- a/caffe2/python/operator_test/rank_loss_operator_test.py +++ b/caffe2/python/operator_test/rank_loss_operator_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given diff --git a/caffe2/python/operator_test/rebatching_queue_test.py b/caffe2/python/operator_test/rebatching_queue_test.py index 930fad30d663..53d3fd4f4ecc 100644 --- a/caffe2/python/operator_test/rebatching_queue_test.py +++ b/caffe2/python/operator_test/rebatching_queue_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/record_queue_test.py b/caffe2/python/operator_test/record_queue_test.py index d32b3e794ab4..00e47ed1cb68 100644 --- a/caffe2/python/operator_test/record_queue_test.py +++ b/caffe2/python/operator_test/record_queue_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.dataset import Dataset from caffe2.python.schema import ( diff --git a/caffe2/python/operator_test/recurrent_net_executor_test.py b/caffe2/python/operator_test/recurrent_net_executor_test.py index 24bd0122f4fb..5d9b83604423 100644 --- a/caffe2/python/operator_test/recurrent_net_executor_test.py +++ b/caffe2/python/operator_test/recurrent_net_executor_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import model_helper, workspace, core, rnn_cell, test_util diff --git a/caffe2/python/operator_test/recurrent_network_test.py b/caffe2/python/operator_test/recurrent_network_test.py index 7cf79edfafed..13650e6cad4e 100644 --- a/caffe2/python/operator_test/recurrent_network_test.py +++ b/caffe2/python/operator_test/recurrent_network_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import recurrent, workspace from caffe2.python.model_helper import ModelHelper 
diff --git a/caffe2/python/operator_test/reduce_ops_test.py b/caffe2/python/operator_test/reduce_ops_test.py index ffb5e8a02667..727631befe89 100644 --- a/caffe2/python/operator_test/reduce_ops_test.py +++ b/caffe2/python/operator_test/reduce_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/reduction_ops_test.py b/caffe2/python/operator_test/reduction_ops_test.py index 018024900281..7d4287df6609 100644 --- a/caffe2/python/operator_test/reduction_ops_test.py +++ b/caffe2/python/operator_test/reduction_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/reshape_ops_test.py b/caffe2/python/operator_test/reshape_ops_test.py index 9c57ed4f3090..a42f00bbf82f 100644 --- a/caffe2/python/operator_test/reshape_ops_test.py +++ b/caffe2/python/operator_test/reshape_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import six from numpy.testing import assert_array_equal diff --git a/caffe2/python/operator_test/resize_op_test.py b/caffe2/python/operator_test/resize_op_test.py index 893e09cf6443..cd90656f607d 100644 --- a/caffe2/python/operator_test/resize_op_test.py +++ b/caffe2/python/operator_test/resize_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/rmac_regions_op_test.py b/caffe2/python/operator_test/rmac_regions_op_test.py index 856832c34b99..084d7402df5f 100644 --- a/caffe2/python/operator_test/rmac_regions_op_test.py +++ b/caffe2/python/operator_test/rmac_regions_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/rms_norm_op_test.py b/caffe2/python/operator_test/rms_norm_op_test.py index f5a35701877c..797b3c9a01c3 100644 --- a/caffe2/python/operator_test/rms_norm_op_test.py +++ b/caffe2/python/operator_test/rms_norm_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py index 64cd7bf48913..8fe037ccb70c 100644 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ b/caffe2/python/operator_test/rnn_cell_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import ( core, gradient_checker, rnn_cell, workspace, scope, utils diff --git a/caffe2/python/operator_test/roi_align_rotated_op_test.py 
b/caffe2/python/operator_test/roi_align_rotated_op_test.py index 0487d962e6fb..c74157a039b0 100644 --- a/caffe2/python/operator_test/roi_align_rotated_op_test.py +++ b/caffe2/python/operator_test/roi_align_rotated_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/rowwise_counter_test.py b/caffe2/python/operator_test/rowwise_counter_test.py index a00dd24b3f2c..a9dacc5a6d86 100644 --- a/caffe2/python/operator_test/rowwise_counter_test.py +++ b/caffe2/python/operator_test/rowwise_counter_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import unittest diff --git a/caffe2/python/operator_test/scale_op_test.py b/caffe2/python/operator_test/scale_op_test.py index 14e17dc2c5d5..b5507e2013fa 100644 --- a/caffe2/python/operator_test/scale_op_test.py +++ b/caffe2/python/operator_test/scale_op_test.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/segment_ops_test.py b/caffe2/python/operator_test/segment_ops_test.py index 01c415eac953..f991a7dde211 100644 --- a/caffe2/python/operator_test/segment_ops_test.py +++ b/caffe2/python/operator_test/segment_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from functools import partial from hypothesis import given, settings diff --git a/caffe2/python/operator_test/selu_op_test.py b/caffe2/python/operator_test/selu_op_test.py index fc903f159a4e..4dd2fa1848bf 100644 --- a/caffe2/python/operator_test/selu_op_test.py +++ b/caffe2/python/operator_test/selu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index 720bf9f02030..4609473f91f0 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from functools import partial diff --git a/caffe2/python/operator_test/shape_inference_test.py b/caffe2/python/operator_test/shape_inference_test.py index aca6ff38a517..702effc226d6 100644 --- a/caffe2/python/operator_test/shape_inference_test.py +++ b/caffe2/python/operator_test/shape_inference_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py b/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py index a925783c206e..6e8cae62dbff 100644 --- 
a/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py +++ b/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/softmax_ops_test.py b/caffe2/python/operator_test/softmax_ops_test.py index f0f6c22cd10b..3ae26de6b513 100644 --- a/caffe2/python/operator_test/softmax_ops_test.py +++ b/caffe2/python/operator_test/softmax_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/softplus_op_test.py b/caffe2/python/operator_test/softplus_op_test.py index ac28a1a9a51e..dd183b774f92 100644 --- a/caffe2/python/operator_test/softplus_op_test.py +++ b/caffe2/python/operator_test/softplus_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py b/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py index 14d637f50f41..2ba21bb6d44f 100644 --- a/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py +++ b/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/sparse_gradient_checker_test.py b/caffe2/python/operator_test/sparse_gradient_checker_test.py index 9bdae01d1318..f1f85b1f9bec 100644 --- a/caffe2/python/operator_test/sparse_gradient_checker_test.py +++ b/caffe2/python/operator_test/sparse_gradient_checker_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py b/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py index 74690c8a2c56..fb958492cfa9 100644 --- a/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py +++ b/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse import datetime diff --git a/caffe2/python/operator_test/sparse_lp_regularizer_test.py b/caffe2/python/operator_test/sparse_lp_regularizer_test.py index b0d0b4b5c9b3..7ea32bd69a29 100644 --- a/caffe2/python/operator_test/sparse_lp_regularizer_test.py +++ b/caffe2/python/operator_test/sparse_lp_regularizer_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis from hypothesis import given, settings, HealthCheck diff --git 
a/caffe2/python/operator_test/sparse_normalize_test.py b/caffe2/python/operator_test/sparse_normalize_test.py index bd8dbd5f7b53..ecc4ae0c8d22 100644 --- a/caffe2/python/operator_test/sparse_normalize_test.py +++ b/caffe2/python/operator_test/sparse_normalize_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis from hypothesis import given, settings, HealthCheck diff --git a/caffe2/python/operator_test/sparse_ops_test.py b/caffe2/python/operator_test/sparse_ops_test.py index 1cf243ed05c4..089174007b18 100644 --- a/caffe2/python/operator_test/sparse_ops_test.py +++ b/caffe2/python/operator_test/sparse_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.test_util import rand_array diff --git a/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py b/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py index 03deb62d8513..41ec8808bb6a 100644 --- a/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py +++ b/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py index 1186161e5f46..35f7bd2a5e29 100644 --- a/caffe2/python/operator_test/spatial_bn_op_test.py +++ b/caffe2/python/operator_test/spatial_bn_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import brew, core, utils, workspace diff --git a/caffe2/python/operator_test/specialized_segment_ops_test.py b/caffe2/python/operator_test/specialized_segment_ops_test.py index fe768e193c88..4f1842ac4664 100644 --- a/caffe2/python/operator_test/specialized_segment_ops_test.py +++ b/caffe2/python/operator_test/specialized_segment_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import unittest diff --git a/caffe2/python/operator_test/square_root_divide_op_test.py b/caffe2/python/operator_test/square_root_divide_op_test.py index 172c6cbafa16..5bd6cb1d08f8 100644 --- a/caffe2/python/operator_test/square_root_divide_op_test.py +++ b/caffe2/python/operator_test/square_root_divide_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from functools import partial diff --git a/caffe2/python/operator_test/stats_ops_test.py b/caffe2/python/operator_test/stats_ops_test.py index edc36facb236..6114dfed3b10 100644 --- a/caffe2/python/operator_test/stats_ops_test.py +++ b/caffe2/python/operator_test/stats_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from 
caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/stats_put_ops_test.py b/caffe2/python/operator_test/stats_put_ops_test.py index 0a42d5d23728..12a9e6826fd1 100644 --- a/caffe2/python/operator_test/stats_put_ops_test.py +++ b/caffe2/python/operator_test/stats_put_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/storm_test.py b/caffe2/python/operator_test/storm_test.py index 2ae402a8a290..c97f631d2160 100644 --- a/caffe2/python/operator_test/storm_test.py +++ b/caffe2/python/operator_test/storm_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/operator_test/string_ops_test.py b/caffe2/python/operator_test/string_ops_test.py index 969e8c7e11e5..eedb57be1d6c 100644 --- a/caffe2/python/operator_test/string_ops_test.py +++ b/caffe2/python/operator_test/string_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/text_file_reader_test.py b/caffe2/python/operator_test/text_file_reader_test.py index 41ba814af6ab..8889ddb9f53c 100644 --- a/caffe2/python/operator_test/text_file_reader_test.py +++ b/caffe2/python/operator_test/text_file_reader_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.text_file_reader import TextFileReader from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/thresholded_relu_op_test.py b/caffe2/python/operator_test/thresholded_relu_op_test.py index 9c103c85c03c..0cd5c0f77895 100644 --- a/caffe2/python/operator_test/thresholded_relu_op_test.py +++ b/caffe2/python/operator_test/thresholded_relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/tile_op_test.py b/caffe2/python/operator_test/tile_op_test.py index 51471f797b34..d39dfeee0ad7 100644 --- a/caffe2/python/operator_test/tile_op_test.py +++ b/caffe2/python/operator_test/tile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/top_k_test.py b/caffe2/python/operator_test/top_k_test.py index 85cf902812ee..fa628456c3a4 100644 --- a/caffe2/python/operator_test/top_k_test.py +++ b/caffe2/python/operator_test/top_k_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st import numpy as np diff 
--git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index a1ddbaa9509e..55f26a89987f 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/transpose_op_test.py b/caffe2/python/operator_test/transpose_op_test.py index e4b739a741ac..4ccec250e22b 100644 --- a/caffe2/python/operator_test/transpose_op_test.py +++ b/caffe2/python/operator_test/transpose_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/trigonometric_op_test.py b/caffe2/python/operator_test/trigonometric_op_test.py index 5d57940dc33e..04b98857c301 100644 --- a/caffe2/python/operator_test/trigonometric_op_test.py +++ b/caffe2/python/operator_test/trigonometric_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/unique_ops_test.py b/caffe2/python/operator_test/unique_ops_test.py index 016554321983..b49f4765539e 100644 --- a/caffe2/python/operator_test/unique_ops_test.py +++ b/caffe2/python/operator_test/unique_ops_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/unique_uniform_fill_op_test.py b/caffe2/python/operator_test/unique_uniform_fill_op_test.py index f858e8fa06bd..1026745db724 100644 --- a/caffe2/python/operator_test/unique_uniform_fill_op_test.py +++ b/caffe2/python/operator_test/unique_uniform_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/upsample_op_test.py b/caffe2/python/operator_test/upsample_op_test.py index a56d1edebe68..61b01644bcf5 100644 --- a/caffe2/python/operator_test/upsample_op_test.py +++ b/caffe2/python/operator_test/upsample_op_test.py @@ -13,9 +13,9 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/utility_ops_test.py b/caffe2/python/operator_test/utility_ops_test.py index 2814d7a02775..241d1e4c1b56 100644 --- a/caffe2/python/operator_test/utility_ops_test.py +++ b/caffe2/python/operator_test/utility_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/video_input_op_test.py b/caffe2/python/operator_test/video_input_op_test.py index c06183c0f1bb..f21f219bd90e 100644 --- a/caffe2/python/operator_test/video_input_op_test.py +++ b/caffe2/python/operator_test/video_input_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import os import shutil diff --git a/caffe2/python/operator_test/weight_scale_test.py b/caffe2/python/operator_test/weight_scale_test.py index 9988ebc309d2..5cdc11eb4d11 100644 --- a/caffe2/python/operator_test/weight_scale_test.py +++ b/caffe2/python/operator_test/weight_scale_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/weighted_multi_sample_test.py b/caffe2/python/operator_test/weighted_multi_sample_test.py index 8b0966590594..830a9f9849c7 100644 --- a/caffe2/python/operator_test/weighted_multi_sample_test.py +++ b/caffe2/python/operator_test/weighted_multi_sample_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/weighted_sample_test.py b/caffe2/python/operator_test/weighted_sample_test.py index 24326d6337c4..032e9e9d755e 100644 --- a/caffe2/python/operator_test/weighted_sample_test.py +++ b/caffe2/python/operator_test/weighted_sample_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/weighted_sum_test.py b/caffe2/python/operator_test/weighted_sum_test.py index 4940bc69a052..2c7dffe92672 100644 --- a/caffe2/python/operator_test/weighted_sum_test.py +++ b/caffe2/python/operator_test/weighted_sum_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/wngrad_test.py b/caffe2/python/operator_test/wngrad_test.py index 2a48bed86358..48fe0f94731e 100644 --- a/caffe2/python/operator_test/wngrad_test.py +++ b/caffe2/python/operator_test/wngrad_test.py @@ 
-1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 21a61a93d00c..9a2f9f541420 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -1,6 +1,6 @@ # @package optimizer # Module caffe2.python.optimizer -from __future__ import absolute_import, division, print_function, unicode_literals + import copy import logging diff --git a/caffe2/python/optimizer_context.py b/caffe2/python/optimizer_context.py index 483f08dc5aff..d1593f440383 100644 --- a/caffe2/python/optimizer_context.py +++ b/caffe2/python/optimizer_context.py @@ -1,9 +1,9 @@ ## @package optimizer_context # Module caffe2.python.optimizer_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context from caffe2.python.modifier_context import ( diff --git a/caffe2/python/optimizer_test.py b/caffe2/python/optimizer_test.py index a45571f19683..90f0932d23f6 100644 --- a/caffe2/python/optimizer_test.py +++ b/caffe2/python/optimizer_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.proto import caffe2_pb2 import caffe2.python.optimizer as optimizer from caffe2.python.optimizer import ( diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py index f7df35bfee70..02276b08c176 100644 --- a/caffe2/python/optimizer_test_util.py +++ b/caffe2/python/optimizer_test_util.py @@ -1,9 +1,9 @@ ## @package optimizer_test_util # Module caffe2.python.optimizer_test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/parallel_workers.py b/caffe2/python/parallel_workers.py index 224dbf66b6ce..4ee446610bdb 100644 --- a/caffe2/python/parallel_workers.py +++ b/caffe2/python/parallel_workers.py @@ -1,9 +1,9 @@ # @package parallel_workers # Module caffe2.python.parallel_workers -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + ''' diff --git a/caffe2/python/parallel_workers_test.py b/caffe2/python/parallel_workers_test.py index a3367e6ee351..a9a7c6a078d7 100644 --- a/caffe2/python/parallel_workers_test.py +++ b/caffe2/python/parallel_workers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/parallelize_bmuf_distributed_test.py b/caffe2/python/parallelize_bmuf_distributed_test.py index b3647a2007f5..c38a4ccc34d7 100644 --- a/caffe2/python/parallelize_bmuf_distributed_test.py +++ b/caffe2/python/parallelize_bmuf_distributed_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from multiprocessing import Process, Manager diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py index 5b30da4387f3..4625d0b0458c 100644 --- a/caffe2/python/pipeline.py +++ b/caffe2/python/pipeline.py @@ -1,9 +1,9 @@ ## @package 
pipeline # Module caffe2.python.pipeline -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, queue_util from caffe2.python.dataio import Reader, Writer diff --git a/caffe2/python/pipeline_test.py b/caffe2/python/pipeline_test.py index 5f57355b25d3..fe00933ac4e1 100644 --- a/caffe2/python/pipeline_test.py +++ b/caffe2/python/pipeline_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.schema import ( Struct, FetchRecord, NewRecord, FeedRecord, InitEmptyRecord) diff --git a/caffe2/python/predictor/mobile_exporter.py b/caffe2/python/predictor/mobile_exporter.py index 7eea50464504..e0fa90bffb6e 100644 --- a/caffe2/python/predictor/mobile_exporter.py +++ b/caffe2/python/predictor/mobile_exporter.py @@ -1,10 +1,10 @@ ## @package mobile_exporter # Module caffe2.python.mobile_exporter -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, utils from caffe2.proto import caffe2_pb2 import numpy as np diff --git a/caffe2/python/predictor/mobile_exporter_test.py b/caffe2/python/predictor/mobile_exporter_test.py index 1c4cf77ea051..0269ec229888 100644 --- a/caffe2/python/predictor/mobile_exporter_test.py +++ b/caffe2/python/predictor/mobile_exporter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.test_util import TestCase from caffe2.python import workspace, brew from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/predictor/predictor_exporter.py b/caffe2/python/predictor/predictor_exporter.py index e9759862fcb5..c8c68f9f30a0 100644 --- a/caffe2/python/predictor/predictor_exporter.py +++ b/caffe2/python/predictor/predictor_exporter.py @@ -1,9 +1,9 @@ ## @package predictor_exporter # Module caffe2.python.predictor.predictor_exporter -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.proto import metanet_pb2 diff --git a/caffe2/python/predictor/predictor_exporter_test.py b/caffe2/python/predictor/predictor_exporter_test.py index 9c8b16c30705..2a0685fb955c 100644 --- a/caffe2/python/predictor/predictor_exporter_test.py +++ b/caffe2/python/predictor/predictor_exporter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import tempfile import unittest diff --git a/caffe2/python/predictor/predictor_py_utils.py b/caffe2/python/predictor/predictor_py_utils.py index 1af5923952dc..cc831454a08c 100644 --- a/caffe2/python/predictor/predictor_py_utils.py +++ b/caffe2/python/predictor/predictor_py_utils.py @@ -1,9 +1,9 @@ ## @package predictor_py_utils # Module caffe2.python.predictor.predictor_py_utils -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope diff --git 
a/caffe2/python/predictor/predictor_test.py b/caffe2/python/predictor/predictor_test.py index 26c4cae63b57..64c88006686c 100644 --- a/caffe2/python/predictor/predictor_test.py +++ b/caffe2/python/predictor/predictor_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/predictor/serde.py b/caffe2/python/predictor/serde.py index af48b2920a87..2b8f1544803d 100644 --- a/caffe2/python/predictor/serde.py +++ b/caffe2/python/predictor/serde.py @@ -1,9 +1,9 @@ ## @package serde # Module caffe2.python.predictor.serde -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def serialize_protobuf_struct(protobuf_struct): diff --git a/caffe2/python/predictor_constants.py b/caffe2/python/predictor_constants.py index c1e1dedb8b09..eda0c66974f4 100644 --- a/caffe2/python/predictor_constants.py +++ b/caffe2/python/predictor_constants.py @@ -1,9 +1,9 @@ ## @package predictor_constants # Module caffe2.python.predictor_constants -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.proto.predictor_consts_pb2 as predictor_consts predictor_constants = predictor_consts.PredictorConsts() diff --git a/caffe2/python/python_op_test.py b/caffe2/python/python_op_test.py index 5a8cfe4a9b46..893671b96f45 100644 --- a/caffe2/python/python_op_test.py +++ b/caffe2/python/python_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.core import CreatePythonOperator import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/queue_util.py b/caffe2/python/queue_util.py index 62265758c2f2..c9a91fc27d17 100644 --- a/caffe2/python/queue_util.py +++ b/caffe2/python/queue_util.py @@ -1,9 +1,9 @@ ## @package queue_util # Module caffe2.python.queue_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, dataio from caffe2.python.task import TaskGroup diff --git a/caffe2/python/record_queue.py b/caffe2/python/record_queue.py index d5f129a2f902..1170c2bf3a82 100644 --- a/caffe2/python/record_queue.py +++ b/caffe2/python/record_queue.py @@ -3,10 +3,10 @@ """ Implementation of a queue wrapper. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.dataio import Reader, Writer diff --git a/caffe2/python/recurrent.py b/caffe2/python/recurrent.py index e5b48894efbc..d4762f08c683 100644 --- a/caffe2/python/recurrent.py +++ b/caffe2/python/recurrent.py @@ -1,9 +1,9 @@ ## @package recurrent # Module caffe2.python.recurrent -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from future.utils import viewitems, viewkeys diff --git a/caffe2/python/regularizer.py b/caffe2/python/regularizer.py index e994de8b0c44..4042149ca80c 100644 --- a/caffe2/python/regularizer.py +++ b/caffe2/python/regularizer.py @@ -1,6 +1,6 @@ # @package optimizer # Module caffe2.python.regularizer -from __future__ import absolute_import, division, print_function, unicode_literals + from caffe2.python import core, utils import numpy as np diff --git a/caffe2/python/regularizer_context.py b/caffe2/python/regularizer_context.py index 6935fdcb47c0..5d79e138b6b7 100644 --- a/caffe2/python/regularizer_context.py +++ b/caffe2/python/regularizer_context.py @@ -1,9 +1,9 @@ # @package regularizer_context # Module caffe2.python.regularizer_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context from caffe2.python.modifier_context import ( diff --git a/caffe2/python/regularizer_test.py b/caffe2/python/regularizer_test.py index 2018040433b4..685feaf93ed2 100644 --- a/caffe2/python/regularizer_test.py +++ b/caffe2/python/regularizer_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/rnn/__init__.py b/caffe2/python/rnn/__init__.py index a37eb20fda26..3f2ff2d6cc8f 100644 --- a/caffe2/python/rnn/__init__.py +++ b/caffe2/python/rnn/__init__.py @@ -1,5 +1,5 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + diff --git a/caffe2/python/rnn/lstm_comparison.py b/caffe2/python/rnn/lstm_comparison.py index c3bf9b30cea7..dee96413dbe5 100644 --- a/caffe2/python/rnn/lstm_comparison.py +++ b/caffe2/python/rnn/lstm_comparison.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, lstm_benchmark, utils from copy import copy diff --git a/caffe2/python/rnn/rnn_cell_test_util.py b/caffe2/python/rnn/rnn_cell_test_util.py index 1533c1e3d418..95728d682bfa 100644 --- a/caffe2/python/rnn/rnn_cell_test_util.py +++ b/caffe2/python/rnn/rnn_cell_test_util.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, scope from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py index 8192b34dc12e..e16bfaaf491e 100644 --- 
a/caffe2/python/rnn_cell.py +++ b/caffe2/python/rnn_cell.py @@ -1,9 +1,9 @@ ## @package rnn_cell # Module caffe2.python.rnn_cell -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools import inspect diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py index 50fe136a5a12..fb7cadf42847 100644 --- a/caffe2/python/schema.py +++ b/caffe2/python/schema.py @@ -13,10 +13,10 @@ walkthrough on how to use schema to store and iterate through a structured in-memory dataset. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging import numpy as np diff --git a/caffe2/python/schema_test.py b/caffe2/python/schema_test.py index 28bf5c64a428..dca19a127ef2 100644 --- a/caffe2/python/schema_test.py +++ b/caffe2/python/schema_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema import numpy as np diff --git a/caffe2/python/scope.py b/caffe2/python/scope.py index be05aa468d10..11fddc7b0f62 100644 --- a/caffe2/python/scope.py +++ b/caffe2/python/scope.py @@ -1,9 +1,9 @@ ## @package scope # Module caffe2.python.scope -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import contextlib import threading diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py index b24fc6851428..9bd69eb32902 100644 --- a/caffe2/python/scope_test.py +++ b/caffe2/python/scope_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope, core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/serialized_test/coverage.py b/caffe2/python/serialized_test/coverage.py index 7ba93f66af6b..2014847242c4 100644 --- a/caffe2/python/serialized_test/coverage.py +++ b/caffe2/python/serialized_test/coverage.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/serialized_test/serialized_test_util.py b/caffe2/python/serialized_test/serialized_test_util.py index 30810d9d8283..621adca9454e 100644 --- a/caffe2/python/serialized_test/serialized_test_util.py +++ b/caffe2/python/serialized_test/serialized_test_util.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/session.py b/caffe2/python/session.py index 9059e1eabc94..de3b09931a30 100644 --- a/caffe2/python/session.py +++ b/caffe2/python/session.py @@ -1,9 +1,9 @@ ## @package session # Module caffe2.python.session -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace diff --git 
a/caffe2/python/session_test.py b/caffe2/python/session_test.py index ae5e50d23ec7..fa505c296820 100644 --- a/caffe2/python/session_test.py +++ b/caffe2/python/session_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.schema import ( Struct, FetchRecord, NewRecord, FeedRecord, InitEmptyRecord) diff --git a/caffe2/python/sparse_to_dense_mask_test.py b/caffe2/python/sparse_to_dense_mask_test.py index 375068ef537e..e62c7e6d41dc 100644 --- a/caffe2/python/sparse_to_dense_mask_test.py +++ b/caffe2/python/sparse_to_dense_mask_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/sparse_to_dense_test.py b/caffe2/python/sparse_to_dense_test.py index 5e6d10823e5f..dc43d2c03394 100644 --- a/caffe2/python/sparse_to_dense_test.py +++ b/caffe2/python/sparse_to_dense_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/task.py b/caffe2/python/task.py index 9dcb211274b3..f1b25ee26092 100644 --- a/caffe2/python/task.py +++ b/caffe2/python/task.py @@ -1,9 +1,9 @@ ## @package task # Module caffe2.python.task -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, context from caffe2.python.schema import Field, from_blob_list diff --git a/caffe2/python/task_test.py b/caffe2/python/task_test.py index f1c51bc5b442..c44e93a3704c 100644 --- a/caffe2/python/task_test.py +++ b/caffe2/python/task_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import task diff --git a/caffe2/python/test/blob_deallocation_test.py b/caffe2/python/test/blob_deallocation_test.py index 66d6835c4814..37886618ef45 100644 --- a/caffe2/python/test/blob_deallocation_test.py +++ b/caffe2/python/test/blob_deallocation_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace import unittest diff --git a/caffe2/python/test/do_op_test.py b/caffe2/python/test/do_op_test.py index 72e9f83c9540..fcc6918d5350 100644 --- a/caffe2/python/test/do_op_test.py +++ b/caffe2/python/test/do_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/test/executor_test.py b/caffe2/python/test/executor_test.py index 84df86fb05b0..b4db64005f62 100644 --- a/caffe2/python/test/executor_test.py +++ b/caffe2/python/test/executor_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace 
from caffe2.python.test.executor_test_util import ( diff --git a/caffe2/python/test/executor_test_util.py b/caffe2/python/test/executor_test_util.py index bf93c49d8cdc..ba10247eaa2e 100644 --- a/caffe2/python/test/executor_test_util.py +++ b/caffe2/python/test/executor_test_util.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import ( diff --git a/caffe2/python/test/fakefp16_transform_test.py b/caffe2/python/test/fakefp16_transform_test.py index d58d12ad60de..f98342eba54a 100644 --- a/caffe2/python/test/fakefp16_transform_test.py +++ b/caffe2/python/test/fakefp16_transform_test.py @@ -1,6 +1,6 @@ -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + import unittest from caffe2.python.fakefp16_transform_lib import fakeFp16FuseOps diff --git a/caffe2/python/test/gpu_context_test.py b/caffe2/python/test/gpu_context_test.py index 741f39d6dc8a..9ee8a308cc2e 100644 --- a/caffe2/python/test/gpu_context_test.py +++ b/caffe2/python/test/gpu_context_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/test/python_protobuf_test.py b/caffe2/python/test/python_protobuf_test.py index 817f5e21a563..7790e0f6d8f5 100644 --- a/caffe2/python/test/python_protobuf_test.py +++ b/caffe2/python/test/python_protobuf_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + # make sure we use cpp implementation of protobuf import os diff --git a/caffe2/python/test_util.py b/caffe2/python/test_util.py index a2cf3aced07c..94ac41524065 100644 --- a/caffe2/python/test_util.py +++ b/caffe2/python/test_util.py @@ -1,9 +1,9 @@ ## @package test_util # Module caffe2.python.test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/text_file_reader.py b/caffe2/python/text_file_reader.py index 52a1b274f086..48f69f90c7b4 100644 --- a/caffe2/python/text_file_reader.py +++ b/caffe2/python/text_file_reader.py @@ -1,9 +1,9 @@ ## @package text_file_reader # Module caffe2.python.text_file_reader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.dataio import Reader from caffe2.python.schema import Scalar, Struct, data_type_for_dtype diff --git a/caffe2/python/timeout_guard.py b/caffe2/python/timeout_guard.py index 07226c128ffe..2314a3ad9c24 100644 --- a/caffe2/python/timeout_guard.py +++ b/caffe2/python/timeout_guard.py @@ -1,9 +1,9 @@ ## @package timeout_guard # Module caffe2.python.timeout_guard -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import contextlib import threading diff --git a/caffe2/python/transformations.py b/caffe2/python/transformations.py index ed0a32788de8..fc1bad34b201 100644 --- a/caffe2/python/transformations.py +++ b/caffe2/python/transformations.py @@ -13,10 +13,10 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python._import_c_extension as C diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 363ceb19619d..14b97e4939ef 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import hypothesis.strategies as st diff --git a/caffe2/python/trt/test_trt.py b/caffe2/python/trt/test_trt.py index e95cb4bd46e3..39d37ca9fa0a 100644 --- a/caffe2/python/trt/test_trt.py +++ b/caffe2/python/trt/test_trt.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py index ce45ae3cb86d..0936941aac03 100644 --- a/caffe2/python/trt/transform.py +++ b/caffe2/python/trt/transform.py @@ -6,10 +6,10 @@ Note that ONNX-TRT enforce an NCHW input! """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op diff --git a/caffe2/python/tt_core.py b/caffe2/python/tt_core.py index a2011da16b15..314718b76c9d 100644 --- a/caffe2/python/tt_core.py +++ b/caffe2/python/tt_core.py @@ -1,8 +1,8 @@ ## @package tt_core # Module caffe2.python.tt_core -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np diff --git a/caffe2/python/tt_core_test.py b/caffe2/python/tt_core_test.py index aec5764e66e5..0cee3b254720 100644 --- a/caffe2/python/tt_core_test.py +++ b/caffe2/python/tt_core_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py index 9cf30d9c06b3..947dd9bf296d 100644 --- a/caffe2/python/utils.py +++ b/caffe2/python/utils.py @@ -1,9 +1,9 @@ # @package utils # Module caffe2.python.utils -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python.compatibility import container_abcs diff --git a/caffe2/python/utils_test.py b/caffe2/python/utils_test.py index 3921f3d67ca7..ef809bfd8154 100644 --- a/caffe2/python/utils_test.py +++ b/caffe2/python/utils_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, utils, test_util diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index 
f76fcf75a33a..99983e84f097 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -1,9 +1,9 @@ ## @package workspace # Module caffe2.python.workspace -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import collections import contextlib from google.protobuf.message import Message diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 7e64220f480e..86dbcf5d70ba 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import os diff --git a/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py b/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py index 08f658ba9608..4f4bad64980c 100644 --- a/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py +++ b/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections from itertools import product diff --git a/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py b/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py index 27a07ece62be..1d3fd2cc369d 100644 --- a/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py +++ b/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py b/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py index 82dd1772d5da..24a2269cc850 100644 --- a/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py +++ b/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/concat_dnnlowp_op_test.py b/caffe2/quantization/server/concat_dnnlowp_op_test.py index 777c523aff87..fc7e897993d4 100644 --- a/caffe2/quantization/server/concat_dnnlowp_op_test.py +++ b/caffe2/quantization/server/concat_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py b/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py index 70bcf53f44d4..a605ea3fc49e 100644 --- a/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py +++ b/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py index ae2f49cfe20c..68c14b69f058 100644 --- a/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py +++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git 
a/caffe2/quantization/server/conv_dnnlowp_op_test.py b/caffe2/quantization/server/conv_dnnlowp_op_test.py index 682a4d787aba..11cd12a4d5bc 100644 --- a/caffe2/quantization/server/conv_dnnlowp_op_test.py +++ b/caffe2/quantization/server/conv_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py index 9ed9106db0be..715b6f8c01a8 100644 --- a/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py +++ b/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py b/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py index 773253743c6d..99e914c294b9 100644 --- a/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py +++ b/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/dequantize_dnnlowp_op_test.py b/caffe2/quantization/server/dequantize_dnnlowp_op_test.py index 399ae4363831..5694a553e744 100644 --- a/caffe2/quantization/server/dequantize_dnnlowp_op_test.py +++ b/caffe2/quantization/server/dequantize_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/dnnlowp_test_utils.py b/caffe2/quantization/server/dnnlowp_test_utils.py index 1a41664cb2d1..0d56ea6ac127 100644 --- a/caffe2/quantization/server/dnnlowp_test_utils.py +++ b/caffe2/quantization/server/dnnlowp_test_utils.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py index 1cf65f37858a..75bd2f8e4d44 100644 --- a/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py index 3f199f981331..af1cd0f80684 100644 --- a/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py index b9104f598d08..e31b9d179071 100644 --- a/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py index 
9b3caf41ecc5..faf526b8c48d 100644 --- a/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py index 68059421cfac..5d77eceb8e04 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py +++ b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py b/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py index b8c4a3e22812..f1939e198b84 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_fp16_test.py b/caffe2/quantization/server/fully_connected_fp16_test.py index 710207f7caeb..be1e2c8a1ab5 100644 --- a/caffe2/quantization/server/fully_connected_fp16_test.py +++ b/caffe2/quantization/server/fully_connected_fp16_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py b/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py index a4ba681867ff..284ae56d743e 100644 --- a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py +++ b/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/gather_dnnlowp_op_test.py b/caffe2/quantization/server/gather_dnnlowp_op_test.py index c1f495260722..c2c7f35a66d4 100644 --- a/caffe2/quantization/server/gather_dnnlowp_op_test.py +++ b/caffe2/quantization/server/gather_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/group_norm_dnnlowp_op_test.py b/caffe2/quantization/server/group_norm_dnnlowp_op_test.py index 93a4163c86bb..30051d95b59c 100644 --- a/caffe2/quantization/server/group_norm_dnnlowp_op_test.py +++ b/caffe2/quantization/server/group_norm_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/int8_gen_quant_params_test.py b/caffe2/quantization/server/int8_gen_quant_params_test.py index f2c7fd81dabb..d208d6f9b575 100644 --- a/caffe2/quantization/server/int8_gen_quant_params_test.py +++ b/caffe2/quantization/server/int8_gen_quant_params_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py b/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py index f34081aeba24..70f9b0c2f1fa 100644 --- a/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py +++ b/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py @@ -13,7 +13,7 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu from caffe2.python import core, workspace diff --git a/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py b/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py index 9cd22bd2c491..bcf06ce0274e 100644 --- a/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py +++ b/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/observer_test.py b/caffe2/quantization/server/observer_test.py index 4299c146b2da..5c2b28e5e6fb 100644 --- a/caffe2/quantization/server/observer_test.py +++ b/caffe2/quantization/server/observer_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/quantization/server/pool_dnnlowp_op_test.py b/caffe2/quantization/server/pool_dnnlowp_op_test.py index d581fbef00cd..fedc87ee732a 100644 --- a/caffe2/quantization/server/pool_dnnlowp_op_test.py +++ b/caffe2/quantization/server/pool_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/quantize_dnnlowp_op_test.py b/caffe2/quantization/server/quantize_dnnlowp_op_test.py index caaf456fb84e..e61a28b4b930 100644 --- a/caffe2/quantization/server/quantize_dnnlowp_op_test.py +++ b/caffe2/quantization/server/quantize_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/relu_dnnlowp_op_test.py b/caffe2/quantization/server/relu_dnnlowp_op_test.py index 5e85b4e43ed6..68b5aed049f1 100644 --- a/caffe2/quantization/server/relu_dnnlowp_op_test.py +++ b/caffe2/quantization/server/relu_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py index 47ae47b81106..67017ee0afcc 100644 --- a/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py +++ b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py 
b/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py index 6af92a5d2fe5..b12b3908aafa 100644 --- a/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py +++ b/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py b/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py index 28ff4a0a750b..836745dcf543 100644 --- a/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py +++ b/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py b/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py index b1d34c19d3ae..d7253b1675f4 100644 --- a/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py +++ b/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/tanh_dnnlowp_op_test.py b/caffe2/quantization/server/tanh_dnnlowp_op_test.py index e0af7af62bba..f73befd25e26 100644 --- a/caffe2/quantization/server/tanh_dnnlowp_op_test.py +++ b/caffe2/quantization/server/tanh_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/utils.py b/caffe2/quantization/server/utils.py index 862ed5a9cd62..9e137cb5f6af 100644 --- a/caffe2/quantization/server/utils.py +++ b/caffe2/quantization/server/utils.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import copy import logging diff --git a/scripts/get_python_cmake_flags.py b/scripts/get_python_cmake_flags.py index 0fac6d20d4d4..9121c5ebf0db 100644 --- a/scripts/get_python_cmake_flags.py +++ b/scripts/get_python_cmake_flags.py @@ -12,9 +12,9 @@ # make # -from __future__ import absolute_import -from __future__ import unicode_literals -from __future__ import print_function + + + from distutils import sysconfig import sys diff --git a/setup.py b/setup.py index 753e2b0f14a1..059188875e77 100644 --- a/setup.py +++ b/setup.py @@ -162,7 +162,7 @@ # When turned on, the following cmake variables will be toggled as well: # USE_SYSTEM_CPUINFO=ON USE_SYSTEM_SLEEF=ON BUILD_CUSTOM_PROTOBUF=OFF -from __future__ import print_function + import sys if sys.version_info < (3,): print("Python 2 has reached end-of-life and is no longer supported by PyTorch.") diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index acecbe737e6d..026293a9281a 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -from __future__ import absolute_import, division, print_function + import os import argparse import sys diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index e41c921f1e33..89bf64d8149e 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -22,7 +22,7 @@ # which will in turn dispatch back to VariableType for its # differentiable subcomponents. 
# -from __future__ import print_function + from .utils import CodeTemplate, nested_dict, write, uninplace_api_name from .gen_autograd import VIEW_FUNCTIONS, VIEW_FUNCTIONS_WITH_METADATA_CHANGE, \ MULTI_OUTPUT_SAFE_FUNCTIONS, RETURNS_VIEWS_OF_INPUT diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index 354aedc601ad..f8e8e61857e5 100755 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -12,7 +12,7 @@ glob or regular expressions. """ -from __future__ import print_function + import argparse import collections diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 4b91abf1c6c7..118a3e9b58b7 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -1,4 +1,4 @@ -from __future__ import print_function + import os import collections from pprint import pformat diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index d5db749d1552..83253cc3a526 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -1,6 +1,6 @@ "Manages CMake." -from __future__ import print_function + import multiprocessing import os From 721cfbf8425cf2c1dc5e27d1332e32e1a42ef541 Mon Sep 17 00:00:00 2001 From: Dianshi Li Date: Wed, 23 Sep 2020 18:30:17 -0700 Subject: [PATCH 072/449] [PT Model Split] Support 2 operators in PT by C2 conversion (#45231) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45231 Two operators, `PriorCorrectionCalibrationPrediction` and `GatherRangesToDense`, are not supported in PT, which prevents GLOW from working. To unblock, we first try to use C2->PT conversion. In the long term, we need to implement PT custom ops. This diff does this conversion to unblock the current project. Test Plan: Run unit tests. The test input is from the current DPER example. All pass. ```buck test //caffe2/caffe2/python/operator_test:torch_integration_test -- test_prior_correct_calibration_prediction_op --print-passing-details > c2 reference output > [0.14285715 0.27272728 0.39130434 0.5 ] > PT converted output > tensor([0.1429, 0.2727, 0.3913, 0.5000]) buck test //caffe2/caffe2/python/operator_test:torch_integration_test -- test_gather_ranges_to_dense_op --print-passing-details c2 reference output > [array([[6, 5, 4, 3], [0, 0, 0, 0]], dtype=int64)] > PT converted output > [tensor([[6, 5, 4, 3], [0, 0, 0, 0]])] ``` Reviewed By: allwu, qizzzh Differential Revision: D23858329 fbshipit-source-id: ed37118ca7f09e1cd0ad1fdec3d37f66dce60dd9 --- caffe2/operators/gather_ranges_to_dense_op.cc | 8 +++ caffe2/operators/gather_ranges_to_dense_op.h | 3 + .../operator_test/torch_integration_test.py | 69 ++++++++++++++++++- 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/caffe2/operators/gather_ranges_to_dense_op.cc b/caffe2/operators/gather_ranges_to_dense_op.cc index 10396aafc97e..aa31ef12b36a 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.cc +++ b/caffe2/operators/gather_ranges_to_dense_op.cc @@ -104,3 +104,11 @@ NO_GRADIENT(GatherRangesToDense); } // namespace } // namespace caffe2 + +using GatherRangesToDenseCPUOp = + caffe2::GatherRangesToDenseOp; + +C10_EXPORT_CAFFE2_OP_TO_C10_CPU( + GatherRangesToDense, + "_caffe2::GatherRangesToDense(Tensor data, Tensor ranges, Tensor?
key, int[] lengths, int min_observation, float max_mismatched_ratio, float max_empty_ratio) -> Tensor[] outputs", + GatherRangesToDenseCPUOp); diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index c1dd5a527005..217a61b25129 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" +#include "caffe2/core/export_caffe2_op_to_c10.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" @@ -15,6 +16,8 @@ #include #include +C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(GatherRangesToDense); + namespace caffe2 { template class GatherRangesToDenseOp final : public Operator { diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index 55f26a89987f..7194daa91203 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -7,10 +7,12 @@ import torch import unittest -from caffe2.python import core, workspace +from caffe2.python import core, dyndep, workspace from hypothesis import given, settings from scipy.stats import norm +dyndep.InitOpsLibrary('@/caffe2/caffe2/fb/operators:calibration_op') + def generate_rois(roi_counts, im_dims): assert len(roi_counts) == len(im_dims) @@ -875,6 +877,71 @@ def _batch_bucket_one_hot_ref(data, lengths, boundaries): ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) + def test_gather_ranges_to_dense_op(self): + data = np.array([1, 2, 3, 4, 5, 6, 7, 8]) + ranges = np.array([[[2, 4]], [[0, 0]]]) + key = np.array([0, 1, 3, 2, 1, 0, 1, 0]) + lengths = np.array([4]) + min_observation = 2 + max_mismatched_ratio = 0.5 + max_empty_ratio = 1.0 + + outputs_name = ["X_{}".format(i) for i in range(len(lengths))] + ref_op = core.CreateOperator( + "GatherRangesToDense", + ["data", "ranges", "key"], + outputs_name, + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + workspace.FeedBlob("data", data) + workspace.FeedBlob("ranges", ranges) + workspace.FeedBlob("key", key) + workspace.RunOperatorOnce(ref_op) + ref_outputs = [] + for output_name in outputs_name: + ref_outputs.append(workspace.FetchBlob(output_name)) + + outputs = torch.ops._caffe2.GatherRangesToDense( + torch.from_numpy(data), + torch.from_numpy(ranges), + torch.from_numpy(key), + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + + self.assertEqual(len(ref_outputs), len(outputs)) + for i in range(0, len(ref_outputs)): + np.testing.assert_array_almost_equal(ref_outputs[i], outputs[i].numpy()) + + def test_prior_correct_calibration_prediction_op(self): + beta = np.array([1.0, 2.0], dtype=np.float32) + gamma = np.array([3.0, 4.0], dtype=np.float32) + pred = np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32) + + ref_op = core.CreateOperator( + "PriorCorrectionCalibrationPrediction", + ["beta", "gamma", "pred"], + ["new_pred"], + ) + workspace.FeedBlob("beta", beta) + workspace.FeedBlob("gamma", gamma) + workspace.FeedBlob("pred", pred) + workspace.RunOperatorOnce(ref_op) + ref_output = workspace.FetchBlob("new_pred") + + output = torch.ops._caffe2.PriorCorrectionCalibrationPrediction( + torch.from_numpy(beta), + torch.from_numpy(gamma), + torch.from_numpy(pred), + ) + 
torch.testing.assert_allclose(ref_output, output) + + @given(lengths_0=st.integers(1, 10), lengths_1=st.integers(1, 10)) @settings(deadline=1000) def test_merge_id_lists(self, lengths_0, lengths_1): From 60665ace17b918d6a0548ebc42b6c5bca9014b31 Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Wed, 23 Sep 2020 18:56:07 -0700 Subject: [PATCH 073/449] [quant] Add optimized approach to calculate qparams for qembedding_bag (#45149) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45149 The choose_qparams_optimized calculates the the optimized qparams. It uses a greedy approach to nudge the min and max and calculate the l2 norm and tries to minimize the quant error by doing `torch.norm(x-fake_quant(x,s,z))` Test Plan: Imported from OSS Reviewed By: raghuramank100 Differential Revision: D23848060 fbshipit-source-id: c6c57c9bb07664c3f1c87dd7664543e09f634aee --- aten/src/ATen/native/native_functions.yaml | 4 + aten/src/ATen/native/quantized/QTensor.cpp | 88 +++++++++++++++++++ .../quantized/cpu/qembeddingbag_prepack.cpp | 57 ++++++------ aten/src/ATen/native/quantized/library.cpp | 4 +- test/quantization/test_quantized_op.py | 34 ++++--- tools/autograd/gen_python_functions.py | 1 + .../quantization/insert_quant_dequant.cpp | 5 +- torch/overrides.py | 1 + 8 files changed, 153 insertions(+), 41 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index c61f021f8c5f..ae6afc3818a5 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -4312,6 +4312,10 @@ use_c10_dispatcher: full variants: function +- func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (float, float) + use_c10_dispatcher: full + variants: function + # to(Device) must not exist because all constructors of Device also works for # TensorOptions. Otherwise, an ambiguity error is thrown. # See NOTE [ TensorOptions Constructors ]. diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 5efec6420906..9db2a6eb2ac4 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -232,5 +232,93 @@ std::tuple _choose_qparams_per_tensor( return std::make_tuple(q_params.scale, q_params.zero_point); } +float calculate_quant_loss( + const float* input, + int numel, + float xmin, + float xmax, + float* q_input, + int bit_width) { + xmin = static_cast(xmin); + float data_range = xmax - xmin; + float qmax = (1 << bit_width) - 1; + float scale = data_range == 0 + ? 1.0 + : static_cast(static_cast(data_range / qmax)); + float inverse_scale = 1.0f / scale; + + float norm = 0.0f; + constexpr int VLEN = 8; + int i = 0; + +// TODO add FBGEMM kernel +// #ifdef USE_FBGEMM +// #endif + + // remainder loop + for (; i < numel; i++) { + q_input[i] = std::max( + 0.0f, std::min(nearbyint((input[i] - xmin) * inverse_scale), qmax)); + q_input[i] = q_input[i] * scale + xmin; + norm += (input[i] - q_input[i]) * (input[i] - q_input[i]); + } + return std::sqrt(norm); +} + +/* + Helper function to find the best min/max for a tensor to calculate qparams. + It uses a greedy approach to nudge the min and max and calculate the l2 norm + and tries to minimize the quant error by doing `torch.norm(x-fake_quant(x,s,z))` + Returns the optimized xmax and xmin value of the tensor. 
+*/ +std::tuple choose_qparams_optimized( + const at::Tensor& input_tensor, + int64_t numel, + const int64_t n_bins, + const double ratio, + int64_t bit_width) { + + const float* input_row = input_tensor.data_ptr(); + float xmin = *std::min_element(input_row, input_row + numel); + float xmax = *std::max_element(input_row, input_row + numel); + + float stepsize = (xmax - xmin) / n_bins; + int min_bins = n_bins * (1.0 - (float) ratio); + const float* input = input_tensor.contiguous().data_ptr(); + std::vector q_input(numel); + + float loss = + calculate_quant_loss(input, numel, xmin, xmax, q_input.data(), bit_width); + float best_loss = loss; + + float cur_min = xmin; + float cur_max = xmax; + float cur_loss = loss; + + float thr = min_bins * stepsize; + while (cur_min + thr < cur_max) { + // move left + float loss1 = calculate_quant_loss( + input, numel, cur_min + stepsize, cur_max, q_input.data(), bit_width); + // move right + float loss2 = calculate_quant_loss( + input, numel, cur_min, cur_max - stepsize, q_input.data(), bit_width); + if (cur_loss < loss1 && cur_loss < loss2 && cur_loss < best_loss) { + // found a local optima + best_loss = cur_loss; + xmin = cur_min; + xmax = cur_max; + } + if (loss1 < loss2) { + cur_min = cur_min + stepsize; + cur_loss = loss1; + } else { + cur_max = cur_max - stepsize; + cur_loss = loss2; + } + } + + return std::make_tuple((float) xmax, (float) xmin); +} } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp index dc1f26345e62..6c67b6cc6c86 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp @@ -137,7 +137,10 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { return output; } -Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { +Tensor _qembeddingbag_nbit_prepack_helper( + const Tensor& weight, + int bit_width, + bool optimized_qparams) { int64_t embedding_rows = weight.size(0); int64_t embedding_cols = weight.size(1); @@ -145,16 +148,16 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { const auto weight_data = weight.data_ptr(); TORCH_CHECK( - BIT_RATE == 4 || BIT_RATE == 2, - "BIT_RATE must be either 2 or 4 to use 'qembeddingbag_nbit_prepack'." - "For 8bit, consider using 'embedding_bag_byte_prepack'."); + bit_width == 4 || bit_width == 2, + "bit_width must be either 2 or 4 to use 'qembeddingbag_nbit_prepack'." + "For 8bit, consider using 'embedding_bag_byte_prepack'."); - int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; + int NUM_ELEM_PER_BYTE = 8 / bit_width; TORCH_CHECK( weight_contig.size(weight.dim() - 1) % NUM_ELEM_PER_BYTE == 0, - "qembeddingbag_" + c10::to_string(BIT_RATE) + - "bit_prepack only works for the number of columns a multiple of " - + c10::to_string(NUM_ELEM_PER_BYTE)); + "qembeddingbag_" + c10::to_string(bit_width) + + "bit_prepack only works for the number of columns a multiple of " + + c10::to_string(NUM_ELEM_PER_BYTE)); // The "fused" representation stores the scale and bias with the // row-wise quantized data in one tensor. 
@@ -178,16 +181,20 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { const float* input_row = weight_data + row * embedding_cols; std::uint8_t* output_row = output_data + row * output_columns; - float Xmin = *std::min_element(input_row, input_row + embedding_cols); - float Xmax = *std::max_element(input_row, input_row + embedding_cols); - + float Xmin, Xmax; + if (optimized_qparams) { + std::tie(Xmax, Xmin) = at::choose_qparams_optimized( + weight_contig[row], embedding_cols, 200, 0.16, bit_width); + } else { + Xmin = *std::min_element(input_row, input_row + embedding_cols); + Xmax = *std::max_element(input_row, input_row + embedding_cols); + } Xmin = static_cast(Xmin); - const float range = Xmax - Xmin; - + float range = Xmax - Xmin; // Set scale to 1.0f for the corner case of Xmax == Xmin . // Any non-zero scale would work because during quantization // (X - Xmin) / scale will be 0 for all X unless scale is 0. - at::Half scale = range == 0 ? 1.0f : range / ((1 << BIT_RATE) - 1); + at::Half scale = range == 0 ? 1.0f : range / ((1 << bit_width) - 1); float inverse_scale = scale == 0 ? 1.0f : 1.0f / scale; if (scale == 0 || std::isinf(inverse_scale)) { // Corner case handling when Xmax == Xmin @@ -195,7 +202,6 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { scale = 1.0f; inverse_scale = 1.0f; } - // Update the scale and zero_point of each row. at::Half* output_row_scale_zp = reinterpret_cast( output_row + @@ -209,15 +215,14 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { float X = input_row[col]; std::uint8_t quantized = std::max( 0, - std::min( - lrintf((X - Xmin) * inverse_scale), (1 << BIT_RATE) - 1)); + std::min(lrintf((X - Xmin) * inverse_scale), (1 << bit_width) - 1)); // We pack 2 4-bit values in a byte. Index 0 is packed in the lower 4-bits // and index 1 is packed in the upper 4-bits. if (col % NUM_ELEM_PER_BYTE == 0) { output_row[col / NUM_ELEM_PER_BYTE] = quantized; } else { output_row[col / NUM_ELEM_PER_BYTE] |= - (quantized << ((col % NUM_ELEM_PER_BYTE) * BIT_RATE)); + (quantized << ((col % NUM_ELEM_PER_BYTE) * bit_width)); } } // embedding_cols } // embedding_rows @@ -231,8 +236,9 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { // To later de-quantize values, the scale (range / 15) and zero_point // are stored alongside the data. More precisely, each row first has quantized // values, and then 2-byte fp16 scale and 2-byte zero_offset. -Tensor qembeddingbag_4bit_prepack(const Tensor& weight) { - return _qembeddingbag_nbit_prepack_helper(weight, 4 /*BIT_RATE*/); +Tensor qembeddingbag_4bit_prepack(const Tensor& weight, bool optimized_qparams) { + return _qembeddingbag_nbit_prepack_helper( + weight, 4 /*bit_width*/, optimized_qparams); } // Applies 2-bit row-wise quantization by determining the range @@ -243,8 +249,9 @@ Tensor qembeddingbag_4bit_prepack(const Tensor& weight) { // are stored alongside the data. More precisely, each row first has quantized // values, and then 2-byte fp16 scale and 2-byte zero_offset. // TODO() - Add 2Bit Embedding Lookup operator. 
-Tensor qembeddingbag_2bit_prepack(const Tensor& weight) { - return _qembeddingbag_nbit_prepack_helper(weight, 2 /*BIT_RATE*/); +Tensor qembeddingbag_2bit_prepack(const Tensor& weight, bool optimized_qparams) { + return _qembeddingbag_nbit_prepack_helper( + weight, 2 /*bit_width*/, optimized_qparams); } class QEmbeddingPackWeights final { @@ -255,9 +262,9 @@ class QEmbeddingPackWeights final { }; TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl("embedding_bag_byte_prepack", qembeddingbag_byte_prepack); - m.impl("embedding_bag_4bit_prepack", qembeddingbag_4bit_prepack); - m.impl("embedding_bag_2bit_prepack", qembeddingbag_2bit_prepack); + m.impl("embedding_bag_byte_prepack", TORCH_FN(qembeddingbag_byte_prepack)); + m.impl("embedding_bag_4bit_prepack", TORCH_FN(qembeddingbag_4bit_prepack)); + m.impl("embedding_bag_2bit_prepack", TORCH_FN(qembeddingbag_2bit_prepack)); } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index c8e247b42365..6049ccbe1e46 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -107,9 +107,9 @@ TORCH_LIBRARY(quantized, m) { m.def("embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin"); m.def("embedding_bag_byte_prepack(Tensor weight) -> Tensor"); m.def("embedding_bag_byte_unpack(Tensor weight) -> Tensor"); - m.def("embedding_bag_4bit_prepack(Tensor weight) -> Tensor"); + m.def("embedding_bag_4bit_prepack(Tensor weight, bool optimized_qparams=False) -> Tensor"); m.def("embedding_bag_4bit_unpack(Tensor weight) -> Tensor"); - m.def("embedding_bag_2bit_prepack(Tensor weight) -> Tensor"); + m.def("embedding_bag_2bit_prepack(Tensor weight, bool optimized_qparams=False) -> Tensor"); m.def("embedding_bag_2bit_unpack(Tensor weight) -> Tensor"); m.def("embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> Tensor"); m.def("embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? 
compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"); diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index 674ace864343..9412332c238b 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -119,7 +119,6 @@ def _get_random_tensor_and_q_params(shapes, rand_scale, torch_type): X_scale = 1e-10 return X, X_scale, X_zero_point - class TestQuantizedOps(TestCase): """Helper function to test quantized activation functions.""" @@ -2718,11 +2717,14 @@ def test_qlinear_unpack(self, W, use_channelwise): @unittest.skipIf(sys.platform == "darwin", "Known test failure on Mac.") class TestQuantizedEmbeddingOps(TestCase): - def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate): + def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate, optimized_qparams): weights = torch.from_numpy((np.random.random_sample(( num_embeddings, embedding_dim)) + 1).astype(np.float32)) - w_packed = pack_fn(weights) + if bit_rate == 8: + w_packed = pack_fn(weights) + else: + w_packed = pack_fn(weights, optimized_qparams=optimized_qparams) w_unpacked = unpack_fn(w_packed) if bit_rate == 8: @@ -2753,13 +2755,13 @@ def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embe conversion_op = "FloatToFused2BitRowwiseQuantized" reverse_conversion_op = "Fused2BitRowwiseQuantizedToFloat" - def get_c2_weights(weights): + def get_c2_weights(weights, engine_str): workspace.ResetWorkspace() workspace.FeedBlob("weights", weights) workspace.RunOperatorOnce( core.CreateOperator( - conversion_op, ["weights"], ["quantized_weights"] + conversion_op, ["weights"], ["quantized_weights"], engine=engine_str ) ) emb_q = workspace.FetchBlob("quantized_weights") @@ -2776,7 +2778,11 @@ def get_c2_weights(weights): ) return torch.from_numpy(emb_q), dequantized_data - w_packed_c2, w_unpacked_c2 = get_c2_weights(weights) + if optimized_qparams: + engine = "GREEDY" + else: + engine = "" + w_packed_c2, w_unpacked_c2 = get_c2_weights(weights, engine) # Compare packed weights against C2. 
np.testing.assert_allclose(w_packed.numpy(), w_packed_c2.numpy(), atol=1e-6, rtol=1e-6) @@ -2790,25 +2796,27 @@ def test_embedding_bag_byte_unpack(self, num_embeddings, embedding_dim): pack_fn = torch.ops.quantized.embedding_bag_byte_prepack unpack_fn = torch.ops.quantized.embedding_bag_byte_unpack - self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=8) + self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, 8, False) """ Tests the correctness of the embedding_bag_4bit pack/unpack op against C2 """ @given(num_embeddings=st.integers(10, 100), - embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0),) - def test_embedding_bag_4bit_unpack(self, num_embeddings, embedding_dim): + embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0), + optimized_qparams=st.booleans(),) + def test_embedding_bag_4bit_unpack(self, num_embeddings, embedding_dim, optimized_qparams): pack_fn = torch.ops.quantized.embedding_bag_4bit_prepack unpack_fn = torch.ops.quantized.embedding_bag_4bit_unpack - self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=4) + self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, 4, optimized_qparams) """ Tests the correctness of the embedding_bag_2bit pack/unpack op against C2 """ @given(num_embeddings=st.integers(10, 100), - embedding_dim=st.integers(5, 50).filter(lambda x: x % 8 == 0),) - def test_embedding_bag_2bit_unpack(self, num_embeddings, embedding_dim): + embedding_dim=st.integers(5, 50).filter(lambda x: x % 8 == 0), + optimized_qparams=st.booleans(),) + def test_embedding_bag_2bit_unpack(self, num_embeddings, embedding_dim, optimized_qparams): pack_fn = torch.ops.quantized.embedding_bag_2bit_prepack unpack_fn = torch.ops.quantized.embedding_bag_2bit_unpack - self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=2) + self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, 2, optimized_qparams) def embedding_bag_rowwise_offsets_run( self, bit_rate, num_embeddings, diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index b7fa4a3a8308..995dff38030b 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -404,6 +404,7 @@ def get_cpp_formal(arg, ensure_temp_safe=True): 'std::tuple', 'std::tuple', 'std::tuple', + 'std::tuple', 'std::vector', 'Scalar', 'bool', 'int64_t', 'void*', 'void', 'QScheme', 'double', diff --git a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp index ce6d5cbc23ff..5c6851ce4fab 100644 --- a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp +++ b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp @@ -291,9 +291,13 @@ Node* insertEmbeddingBagOps(Node* observer, const std::string& op_name) { auto observer_out = observer->output(); std::string prepack_fn, quant_fn; + std::vector prepack_inputs = {observer_out}; if (op_name == "embedding_bag_4bit") { + bool optimized_qparams = false; + Value* optimized_qparams_false = g->insertConstant(optimized_qparams); prepack_fn = "quantized::embedding_bag_4bit_prepack"; quant_fn = "quantized::embedding_bag_4bit_rowwise_offsets"; + prepack_inputs.push_back(optimized_qparams_false); } else if (op_name == "embedding_bag_byte") { prepack_fn = "quantized::embedding_bag_byte_prepack"; quant_fn = 
"quantized::embedding_bag_byte_rowwise_offsets"; @@ -302,7 +306,6 @@ Node* insertEmbeddingBagOps(Node* observer, const std::string& op_name) { "Graph Mode Quantization currently supports 4-bit and 8-bit embedding bag quantization."); } - std::vector prepack_inputs = {observer_out}; std::vector uses = observer_out->uses(); Node* embedding_bag_float_op; // We expect that the output of the weight observer will be consumed by the diff --git a/torch/overrides.py b/torch/overrides.py index b287bf17958a..d5f247e5d51a 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -277,6 +277,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.cholesky: lambda input, upper=False, out=None: -1, torch.cholesky_inverse: lambda input, upper=False, out=None: -1, torch.cholesky_solve: lambda input1, input2, upper=False, out=None: -1, + torch.choose_qparams_optimized: lambda input, numel, n_bins, ratio, bit_width: -1, torch.chunk: lambda input, chunks, dim=0: -1, torch.clamp: lambda input, min=None, max=None, out=None: -1, torch.clip: lambda input, min=None, max=None, out=None: -1, From c760bc8fb15a13a66d514b4107ff6ea5d1720e6c Mon Sep 17 00:00:00 2001 From: Jordan Fix Date: Wed, 23 Sep 2020 20:47:51 -0700 Subject: [PATCH 074/449] Add GlowLoadAOTModel flag (#45189) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45189 Pull Request resolved: https://github.com/pytorch/glow/pull/4902 Test Plan: Test locally Reviewed By: yinghai Differential Revision: D23810445 fbshipit-source-id: 56e717d80abbfe76b15d0f4249e1e399a9722753 --- caffe2/opt/onnxifi_op.h | 5 ++++- third_party/foxi | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/caffe2/opt/onnxifi_op.h b/caffe2/opt/onnxifi_op.h index 6a211a604d52..f19403a14e58 100644 --- a/caffe2/opt/onnxifi_op.h +++ b/caffe2/opt/onnxifi_op.h @@ -263,10 +263,13 @@ class OnnxifiOp final : public Operator { defered_blob_reader = ws->GetBlob("__DEFERRED_BLOB_READER__"); } onnxGraph graph{nullptr}; + + static const uint64_t auxPropertiesListAOT[] = { + ONNXIFI_OPTIMIZATION_AOT, ONNXIFI_GRAPH_PROPERTY_NONE}; CAFFE_ENFORCE_EQ( lib_->onnxInitGraph( backend, - nullptr, + use_glow_aot_ ? 
auxPropertiesListAOT : nullptr, onnx_model_str.size(), (const void*)(onnx_model_str.c_str()), weight_descs.size(), diff --git a/third_party/foxi b/third_party/foxi index 9ca418d2f4bc..4aba696ec8f3 160000 --- a/third_party/foxi +++ b/third_party/foxi @@ -1 +1 @@ -Subproject commit 9ca418d2f4bc8e022d843388afa0fd0a14bd57dc +Subproject commit 4aba696ec8f31794fd42880346dc586486205e0a From 2d00ebd29f2363a2f51e88e4c898244679d114f4 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Wed, 23 Sep 2020 21:13:48 -0700 Subject: [PATCH 075/449] Failing test demonstrating problems with mixed output shapes (#44455) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44455 Test Plan: Imported from OSS Reviewed By: gmagogsfm Differential Revision: D23886119 Pulled By: bertmaher fbshipit-source-id: 41787930f154cf4e8a1766613c4cf33b18246555 --- test/test_jit_fuser_te.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index f9aca9a5dea1..dc7e67a14ee2 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1311,6 +1311,36 @@ def fn(x): self.assertEqual(ref, t(x)) self.assertEqual(len(self.findFusionGroups(t.graph_for(x))), 0) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_superslomo(self): + # Test extracted from Super-SloMo: https://github.com/avinashpaliwal/Super-SloMo + # A few interesting things happen here: strided inputs of mixed size, + # plus outputs of mixed shapes. The latter characteristic happened to + # expose a memory corruption bug due to not properly guarding the + # outputs. + def eager(t0, t1, t2, t3, t4): + t5 = torch.mul(t0, t4) + t6 = torch.mul(t2, t3) + t7 = torch.mul(t6, t1) + t9 = torch.add(t5, t7) + t11 = torch.add(t0, t6) + ft_p = torch.div(t9, t11) + return (ft_p, t11, t9, t6) + + t0 = torch.rand(1, 6, 352, 352, device="cuda").transpose(0, 1) + t1 = torch.rand(6, 3, 352, 352, device="cuda") + t2 = torch.rand(6, device="cuda")[None, None, None, :].permute(3, 0, 1, 2) + t3 = torch.rand(6, 1, 352, 352, device="cuda") + t4 = torch.rand(6, 3, 352, 352, device="cuda") + inputs = [t0, t1, t2, t3, t4] + + script = torch.jit.script(eager) + for _ in range(4): + for pair in zip(script(*inputs), eager(*inputs)): + test, ref = pair + torch.testing.assert_allclose(test, ref) + self.assertAllFused(script.graph_for(*inputs)) + if __name__ == '__main__': run_tests() From 956a25d0614dc783931e73e57ac993a885fac3ed Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Wed, 23 Sep 2020 21:16:52 -0700 Subject: [PATCH 076/449] Revert D23858329: [PT Model Split] Support 2 operators in PT by C2 conversion Test Plan: revert-hammer Differential Revision: D23858329 (https://github.com/pytorch/pytorch/commit/721cfbf8425cf2c1dc5e27d1332e32e1a42ef541) Original commit changeset: ed37118ca7f0 fbshipit-source-id: 30c700f80665be11afc608b00a77766064e60b35 --- caffe2/operators/gather_ranges_to_dense_op.cc | 8 --- caffe2/operators/gather_ranges_to_dense_op.h | 3 - .../operator_test/torch_integration_test.py | 69 +------------------ 3 files changed, 1 insertion(+), 79 deletions(-) diff --git a/caffe2/operators/gather_ranges_to_dense_op.cc b/caffe2/operators/gather_ranges_to_dense_op.cc index aa31ef12b36a..10396aafc97e 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.cc +++ b/caffe2/operators/gather_ranges_to_dense_op.cc @@ -104,11 +104,3 @@ NO_GRADIENT(GatherRangesToDense); } // namespace } // namespace caffe2 - -using GatherRangesToDenseCPUOp = - caffe2::GatherRangesToDenseOp; - 
-C10_EXPORT_CAFFE2_OP_TO_C10_CPU( - GatherRangesToDense, - "_caffe2::GatherRangesToDense(Tensor data, Tensor ranges, Tensor? key, int[] lengths, int min_observation, float max_mismatched_ratio, float max_empty_ratio) -> Tensor[] outputs", - GatherRangesToDenseCPUOp); diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index 217a61b25129..c1dd5a527005 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -5,7 +5,6 @@ #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" -#include "caffe2/core/export_caffe2_op_to_c10.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" @@ -16,8 +15,6 @@ #include #include -C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(GatherRangesToDense); - namespace caffe2 { template class GatherRangesToDenseOp final : public Operator { diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index 7194daa91203..55f26a89987f 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -7,12 +7,10 @@ import torch import unittest -from caffe2.python import core, dyndep, workspace +from caffe2.python import core, workspace from hypothesis import given, settings from scipy.stats import norm -dyndep.InitOpsLibrary('@/caffe2/caffe2/fb/operators:calibration_op') - def generate_rois(roi_counts, im_dims): assert len(roi_counts) == len(im_dims) @@ -877,71 +875,6 @@ def _batch_bucket_one_hot_ref(data, lengths, boundaries): ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) - def test_gather_ranges_to_dense_op(self): - data = np.array([1, 2, 3, 4, 5, 6, 7, 8]) - ranges = np.array([[[2, 4]], [[0, 0]]]) - key = np.array([0, 1, 3, 2, 1, 0, 1, 0]) - lengths = np.array([4]) - min_observation = 2 - max_mismatched_ratio = 0.5 - max_empty_ratio = 1.0 - - outputs_name = ["X_{}".format(i) for i in range(len(lengths))] - ref_op = core.CreateOperator( - "GatherRangesToDense", - ["data", "ranges", "key"], - outputs_name, - lengths=lengths, - min_observation=min_observation, - max_mismatched_ratio=max_mismatched_ratio, - max_empty_ratio=max_empty_ratio, - ) - workspace.FeedBlob("data", data) - workspace.FeedBlob("ranges", ranges) - workspace.FeedBlob("key", key) - workspace.RunOperatorOnce(ref_op) - ref_outputs = [] - for output_name in outputs_name: - ref_outputs.append(workspace.FetchBlob(output_name)) - - outputs = torch.ops._caffe2.GatherRangesToDense( - torch.from_numpy(data), - torch.from_numpy(ranges), - torch.from_numpy(key), - lengths=lengths, - min_observation=min_observation, - max_mismatched_ratio=max_mismatched_ratio, - max_empty_ratio=max_empty_ratio, - ) - - self.assertEqual(len(ref_outputs), len(outputs)) - for i in range(0, len(ref_outputs)): - np.testing.assert_array_almost_equal(ref_outputs[i], outputs[i].numpy()) - - def test_prior_correct_calibration_prediction_op(self): - beta = np.array([1.0, 2.0], dtype=np.float32) - gamma = np.array([3.0, 4.0], dtype=np.float32) - pred = np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32) - - ref_op = core.CreateOperator( - "PriorCorrectionCalibrationPrediction", - ["beta", "gamma", "pred"], - ["new_pred"], - ) - workspace.FeedBlob("beta", beta) - workspace.FeedBlob("gamma", gamma) - workspace.FeedBlob("pred", pred) - workspace.RunOperatorOnce(ref_op) - ref_output = workspace.FetchBlob("new_pred") - - output = 
torch.ops._caffe2.PriorCorrectionCalibrationPrediction( - torch.from_numpy(beta), - torch.from_numpy(gamma), - torch.from_numpy(pred), - ) - torch.testing.assert_allclose(ref_output, output) - - @given(lengths_0=st.integers(1, 10), lengths_1=st.integers(1, 10)) @settings(deadline=1000) def test_merge_id_lists(self, lengths_0, lengths_1): From 070fe15e4cc4b684205b869fb8ba4625c3311d8f Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 23 Sep 2020 22:01:14 -0700 Subject: [PATCH 077/449] Add link to profiling recipe from rpc main docs (#45235) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45235 This is so that users know that the profiler works as expected with RPC and they can learn how to use it to profile RPC-based workloads. ghstack-source-id: 112773748 Test Plan: CI Reviewed By: mrshenli Differential Revision: D23777888 fbshipit-source-id: 4805be9b949c8c7929182f291a6524c3c6a725c1 --- docs/source/rpc.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/rpc.rst b/docs/source/rpc.rst index 37adc14faae1..1e4788c99634 100644 --- a/docs/source/rpc.rst +++ b/docs/source/rpc.rst @@ -293,8 +293,11 @@ The RRef design note covers the design of the :ref:`rref` (Remote REFerence) pro Tutorials --------- -The RPC tutorial introduces users to the RPC framework and provides two example applications using :ref:`torch.distributed.rpc` APIs. +The RPC tutorials introduce users to the RPC framework, provide several example applications +using :ref:`torch.distributed.rpc` APIs, and demonstrate how +to use `the profiler `__ to profile RPC-based workloads. - `Getting started with Distributed RPC Framework `__ - `Implementing a Parameter Server using Distributed RPC Framework `__ - `Combining Distributed DataParallel with Distributed RPC Framework `__ +- `Profiling RPC-based Workloads `__ From 6a2e9eb51c453589cfec7cbf79f429fdf46f1fd4 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Wed, 23 Sep 2020 22:07:54 -0700 Subject: [PATCH 078/449] torch.fft: Multi-dimensional transforms (#44550) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44550 Part of the `torch.fft` work (gh-42175). This adds n-dimensional transforms: `fftn`, `ifftn`, `rfftn` and `irfftn`. This is aiming for correctness first, with the implementation on top of the existing `_fft_with_size` restrictions. I plan to follow up later with a more efficient rewrite that makes `_fft_with_size` work with arbitrary numbers of dimensions. 
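For illustration only (not part of this patch): a minimal sketch of how the n-dimensional interface added here is meant to be called, based on the signatures and docstrings in this diff. The shapes and variable names are made up for the example.

```python
import torch
import torch.fft

x = torch.randn(4, 5, 6, dtype=torch.complex128)

# Transform only the last two dimensions, zero-padding each to 8 points.
X = torch.fft.fftn(x, s=(8, 8), dim=(-2, -1), norm="ortho")

# Calling ifftn with the same s/dim/norm recovers the (zero-padded) input.
x_pad = torch.fft.ifftn(X, s=(8, 8), dim=(-2, -1), norm="ortho")

# For real input, rfftn keeps only the non-negative frequencies of the last
# transformed dimension; pass s to irfftn so odd lengths round-trip exactly.
r = torch.randn(10, 9)
R = torch.fft.rfftn(r)
r_back = torch.fft.irfftn(R, s=r.shape)
```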
Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D23846032 Pulled By: mruberry fbshipit-source-id: e6950aa8be438ec5cb95fb10bd7b8bc9ffb7d824 --- aten/src/ATen/WrapDimUtils.h | 14 +- aten/src/ATen/native/SpectralOps.cpp | 168 +++++++++++++++ aten/src/ATen/native/native_functions.yaml | 20 ++ docs/source/fft.rst | 4 + test/test_spectral_ops.py | 178 +++++++++++++++- torch/csrc/api/include/torch/fft.h | 60 ++++++ torch/fft/__init__.py | 225 +++++++++++++++++++++ 7 files changed, 662 insertions(+), 7 deletions(-) diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index c248ea461116..2768efe6e683 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -30,14 +30,15 @@ static inline int64_t maybe_wrap_dim(int64_t dim, const std::vector& dims, int64_t dim_post_expr) { +// wrap each dim in the dims array, taking dim_post_expr as the true number of dimensions +static inline void maybe_wrap_dims_n(int64_t* dims, int64_t ndims, int64_t dim_post_expr) { if (dim_post_expr <= 0) { dim_post_expr = 1; // this will make range [-1, 0] } int64_t min = -dim_post_expr; int64_t max = dim_post_expr - 1; - for (auto& dim : dims) { + for (int64_t i = 0; i < ndims; ++i) { + auto &dim = dims[i]; if (dim < min || dim > max) { TORCH_CHECK_INDEX(false, "Dimension out of range (expected to be in range of [", @@ -47,6 +48,13 @@ static inline void maybe_wrap_dims(std::vector& dims, int64_t dim_post_ } } +// Wrap each dim in a contiguous container, taking dim_post_expr as the true number of dimensions +// E.g. could also be std::array or c10::SmallVector +template +inline void maybe_wrap_dims(Container& dims, int64_t dim_post_expr) { + return maybe_wrap_dims_n(dims.data(), dims.size(), dim_post_expr); +} + // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors // to be "skipped" (both for wrap dimension behavior and dimension size checking). diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index e467c21a4a30..120ef9f73042 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -203,6 +203,116 @@ Tensor fft_c2c(Tensor input, c10::optional n_opt, return out; } +// Dimensions to transform, and the signal shape in those dimensions +struct ShapeAndDims { + DimVector shape, dim; +}; + +// Pre-process n-dimensional fft's `s` and `dim` arguments. +// Wraps dimensions and applies defaulting behavior. +// Also checks transform dims are unique and transform shape is non-empty. 
+ShapeAndDims canonicalize_fft_shape_and_dim_args( + Tensor input, c10::optional shape, c10::optional dim) { + const int64_t input_dim = input.dim(); + const IntArrayRef input_sizes = input.sizes(); + ShapeAndDims ret; + + if (dim) { + ret.dim.resize(dim->size()); + std::copy(dim->begin(), dim->end(), ret.dim.begin()); + maybe_wrap_dims(ret.dim, input_dim); + + // Check dims are unique + DimVector copy = ret.dim; + std::sort(copy.begin(), copy.end()); + auto duplicate = std::adjacent_find(copy.begin(), copy.end()); + TORCH_CHECK(duplicate == copy.end(), "FFT dims must be unique"); + } + + if (shape) { + // Has shape, may have dim + TORCH_CHECK(!dim || dim->size() == shape->size(), + "When given, dim and shape arguments must have the same length"); + TORCH_CHECK(shape->size() <= input_dim, + "Got shape with ", shape->size(), " values but input tensor " + "only has ", input_dim, " dimensions."); + const int64_t transform_ndim = shape->size(); + // If shape is given, dims defaults to the last shape.size() dimensions + if (!dim) { + ret.dim.resize(transform_ndim); + std::iota(ret.dim.begin(), ret.dim.end(), input_dim - transform_ndim); + } + + // Translate shape of -1 to the default length + ret.shape.resize(transform_ndim); + for (int64_t i = 0; i < transform_ndim; ++i) { + const auto n = (*shape)[i]; + ret.shape[i] = n == -1 ? input_sizes[ret.dim[i]] : n; + } + } else if (!dim) { + // No shape, no dim + ret.dim.resize(input_dim); + std::iota(ret.dim.begin(), ret.dim.end(), int64_t{0}); + ret.shape.resize(input_dim); + std::copy(input_sizes.begin(), input_sizes.end(), ret.shape.begin()); + } else { + // No shape, has dim + ret.shape.resize(ret.dim.size()); + for (int64_t i = 0; i < ret.dim.size(); ++i) { + ret.shape[i] = input_sizes[ret.dim[i]]; + } + } + + for (int64_t i = 0; i < ret.shape.size(); ++i) { + TORCH_CHECK(ret.shape[i] > 0, + "Invalid number of data points (", ret.shape[i], ") specified"); + } + + return ret; +} + +// Complex to complex n-dimensional fft +Tensor fftn_c2c( + const Tensor& input, IntArrayRef shape, IntArrayRef dim, + c10::optional norm_str, bool forward) { + TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); + const auto input_dim = input.dim(); + + Tensor x = resize_fft_input(input, dim, shape); + x = at::view_as_real(x); + + const int64_t transform_ndim = dim.size(); + const auto norm = norm_from_string(norm_str, forward); + // _fft_with_size only supports 3 dimensions being transformed at a time. + // This limit is inherited from cuFFT. + constexpr int64_t max_signal_ndim = 3; + + // Transform n dimensions, up to 3 at a time + // TODO: rewrite _fft_with_size to transform more than 3 dimensions at once. 
+ for (int64_t i = 0; i < transform_ndim; i += max_signal_ndim) { + const int64_t signal_ndim = std::min(transform_ndim - i, max_signal_ndim); + DimVector source_dim(signal_ndim); + DimVector dest_dim(signal_ndim); + + for (int64_t j = 0; j < signal_ndim; ++j) { + source_dim[j] = dim[i + j]; + dest_dim[j] = j + (input_dim - signal_ndim); + } + + // _fft operates on up-to the last 3 dims, so move selected dims to the end + x = at::movedim(x, source_dim, dest_dim); + + x = _fft(x, signal_ndim, /*complex_input=*/true, /*complex_output=*/true, + /*inverse=*/!forward, /*signal_sizes=*/{}, /*normalization=*/norm, + /*onesided=*/false); + + // Move transform dims back to their original order + x = at::movedim(x, dest_dim, source_dim); + } + + return at::view_as_complex(x); +} + } // torch.fft.fft, analogous to NumPy's numpy.fft.fft @@ -240,6 +350,64 @@ Tensor fft_ihfft(const Tensor& self, c10::optional n, int64_t dim, return fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/true); } +Tensor fft_fftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + // TODO: For real input, perform rfftn then mirror with conjugate symmetry + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/true); +} + +Tensor fft_ifftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/false); +} + +Tensor fft_rfftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + TORCH_CHECK(desc.shape.size() > 0, "rfftn must transform at least one axis"); + + const auto last_dim = desc.dim.back(); + const auto last_shape = desc.shape.back(); + desc.shape.pop_back(); + desc.dim.pop_back(); + + // rfft on last dim to get hermitian complex shape + auto x = native::fft_rfft(self, last_shape, last_dim, norm); + // Normal fft on remaining dims + return fftn_c2c(x, desc.shape, desc.dim, norm, /*forward=*/true); +} + +Tensor fft_irfftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + TORCH_CHECK(desc.shape.size() > 0, "irfftn must transform at least one axis"); + + const auto last_dim = desc.dim.back(); + const auto last_shape = [&]() -> c10::optional { + // If shape is defaulted in the last dimension, + // pass nullopt to irfft and let it calculate the default size + if (!s.has_value() || (s->back() == -1)) { + return c10::nullopt; + } + return desc.shape.back(); + }(); + desc.shape.pop_back(); + desc.dim.pop_back(); + + // Normal ifft for all but last dim + Tensor x = promote_tensor_fft(self, /*require_complex=*/true); + x = fftn_c2c(x, desc.shape, desc.dim, norm, /*forward=*/false); + // Then 1d irfft on last dim to get real output + return native::fft_irfft(x, last_shape, last_dim, norm); +} // This is a pass-through wrapper function that does the size check and // inferences. 
The actual forward implementation function is called diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index ae6afc3818a5..ae3579cd0aa9 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -7943,6 +7943,26 @@ use_c10_dispatcher: full variants: function +- func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + - func: fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor use_c10_dispatcher: full variants: function, method diff --git a/docs/source/fft.rst b/docs/source/fft.rst index 8ec06a3574d2..a732f3e5c652 100644 --- a/docs/source/fft.rst +++ b/docs/source/fft.rst @@ -19,7 +19,11 @@ Functions .. autofunction:: fft .. autofunction:: ifft +.. autofunction:: fftn +.. autofunction:: ifftn .. autofunction:: rfft .. autofunction:: irfft +.. autofunction:: rfftn +.. autofunction:: irfftn .. autofunction:: hfft .. autofunction:: ihfft diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 59b58fa202d6..d7ef731699b3 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -3,6 +3,7 @@ import math from contextlib import contextmanager from itertools import product +import itertools from torch.testing._internal.common_utils import \ (TestCase, run_tests, TEST_NUMPY, TEST_LIBROSA) @@ -11,7 +12,7 @@ skipCPUIfNoMkl, skipCUDAIfRocm, deviceCountAtLeast, onlyCUDA) from distutils.version import LooseVersion -from typing import Optional +from typing import Optional, List if TEST_NUMPY: @@ -115,6 +116,7 @@ def method_fn(t): @skipCPUIfNoMkl @skipCUDAIfRocm + @onlyOnCPUAndCUDA @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') @precisionOverride({torch.complex64: 1e-4, torch.float: 1e-4}) @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) @@ -226,11 +228,13 @@ def test_fft_round_trip(self, device, dtype): def test_empty_fft(self, device, dtype): t = torch.empty(0, device=device, dtype=dtype) match = r"Invalid number of data points \([-\d]*\) specified" - fft_functions = [torch.fft.fft, torch.fft.ifft, torch.fft.hfft, - torch.fft.irfft] + fft_functions = [torch.fft.fft, torch.fft.fftn, + torch.fft.ifft, torch.fft.ifftn, + torch.fft.irfft, torch.fft.irfftn, + torch.fft.hfft] # Real-only functions if not dtype.is_complex: - fft_functions += [torch.fft.rfft, torch.fft.ihfft] + fft_functions += [torch.fft.rfft, torch.fft.rfftn, torch.fft.ihfft] for fn in fft_functions: with self.assertRaisesRegex(RuntimeError, match): @@ -242,6 +246,9 @@ def test_fft_invalid_dtypes(self, device): with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): torch.fft.rfft(t) + with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + torch.fft.rfftn(t) + with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): torch.fft.ihfft(t) @@ -292,7 +299,9 @@ def test_fft_half_errors(self, device, dtype): # TODO: Remove torch.half error 
when complex32 is fully implemented x = torch.randn(64, device=device).to(dtype) fft_functions = (torch.fft.fft, torch.fft.ifft, + torch.fft.fftn, torch.fft.ifftn, torch.fft.rfft, torch.fft.irfft, + torch.fft.rfftn, torch.fft.irfftn, torch.fft.hfft, torch.fft.ihfft) for fn in fft_functions: with self.assertRaisesRegex(RuntimeError, "Unsupported dtype "): @@ -300,6 +309,7 @@ def test_fft_half_errors(self, device, dtype): @skipCPUIfNoMkl @skipCUDAIfRocm + @onlyOnCPUAndCUDA @dtypes(torch.double, torch.complex128) # gradcheck requires double def test_fft_backward(self, device, dtype): test_args = list(product( @@ -340,6 +350,166 @@ def test_fn(x): self.assertTrue(torch.autograd.gradcheck(test_fn, (input,))) + # nd-fft tests + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @precisionOverride({torch.complex64: 1e-4, torch.float: 1e-4}) + @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + def test_fftn_numpy(self, device, dtype): + norm_modes = ((None, "forward", "backward", "ortho") + if LooseVersion(np.__version__) >= '1.20.0' + else (None, "ortho")) + + # input_ndim, s, dim + transform_desc = [ + *product(range(2, 5), (None,), (None, (0,), (0, -1))), + *product(range(2, 5), (None, (4, 10)), (None,)), + (6, None, None), + (5, None, (1, 3, 4)), + (3, None, (0, -1)), + (3, None, (1,)), + (1, None, (0,)), + (4, (10, 10), None), + (4, (10, 10), (0, 1)) + ] + + fft_functions = ['fftn', 'ifftn', 'irfftn'] + # Real-only functions + if not dtype.is_complex: + fft_functions += ['rfftn'] + + for input_ndim, s, dim in transform_desc: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + input = torch.randn(*shape, device=device, dtype=dtype) + for fname, norm in product(fft_functions, norm_modes): + torch_fn = getattr(torch.fft, fname) + numpy_fn = getattr(np.fft, fname) + + def fn(t: torch.Tensor, s: Optional[List[int]], dim: Optional[List[int]], norm: Optional[str]): + return torch_fn(t, s, dim, norm) + + torch_fns = (torch_fn, torch.jit.script(fn)) + + expected = numpy_fn(input.cpu().numpy(), s, dim, norm) + exact_dtype = dtype in (torch.double, torch.complex128) + for fn in torch_fns: + actual = fn(input, s, dim, norm) + self.assertEqual(actual, expected, exact_dtype=exact_dtype) + + @skipCUDAIfRocm + @skipCPUIfNoMkl + @onlyOnCPUAndCUDA + @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + def test_fftn_round_trip(self, device, dtype): + norm_modes = (None, "forward", "backward", "ortho") + + # input_ndim, dim + transform_desc = [ + *product(range(2, 5), (None, (0,), (0, -1))), + *product(range(2, 5), (None,)), + (7, None), + (5, (1, 3, 4)), + (3, (0, -1)), + (3, (1,)), + (1, 0), + ] + + fft_functions = [(torch.fft.fftn, torch.fft.ifftn)] + + # Real-only functions + if not dtype.is_complex: + fft_functions += [(torch.fft.rfftn, torch.fft.irfftn)] + + for input_ndim, dim in transform_desc: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + x = torch.randn(*shape, device=device, dtype=dtype) + + for (forward, backward), norm in product(fft_functions, norm_modes): + if isinstance(dim, tuple): + s = [x.size(d) for d in dim] + else: + s = x.size() if dim is None else x.size(dim) + + kwargs = {'s': s, 'dim': dim, 'norm': norm} + y = backward(forward(x, **kwargs), **kwargs) + # For real input, ifftn(fftn(x)) will convert to complex + self.assertEqual(x, y, exact_dtype=( + forward != torch.fft.fftn or x.is_complex())) + + @skipCPUIfNoMkl + @skipCUDAIfRocm + 
@onlyOnCPUAndCUDA + @dtypes(torch.double, torch.complex128) # gradcheck requires double + def test_fftn_backward(self, device, dtype): + # input_ndim, s, dim + transform_desc = [ + *product((2, 3), (None,), (None, (0,), (0, -1))), + *product((2, 3), (None, (4, 10)), (None,)), + (4, None, None), + (3, (10, 10), (0, 1)), + (2, (1, 1), (0, 1)), + (2, None, (1,)), + (1, None, (0,)), + (1, (11,), (0,)), + ] + norm_modes = (None, "forward", "backward", "ortho") + + fft_functions = ['fftn', 'ifftn', 'irfftn'] + # Real-only functions + if not dtype.is_complex: + fft_functions += ['rfftn'] + + for input_ndim, s, dim in transform_desc: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + input = torch.randn(*shape, device=device, dtype=dtype) + + for fname, norm in product(fft_functions, norm_modes): + torch_fn = getattr(torch.fft, fname) + + # Workaround for gradcheck's poor support for complex input + # Use real input instead and put view_as_complex into the graph + if dtype.is_complex: + def test_fn(x): + return torch_fn(torch.view_as_complex(x), s, dim, norm) + inputs = (torch.view_as_real(input).detach().requires_grad_(),) + else: + def test_fn(x): + return torch_fn(x, s, dim, norm) + inputs = (input.detach().requires_grad_(),) + + self.assertTrue(torch.autograd.gradcheck(test_fn, inputs)) + + @skipCUDAIfRocm + @skipCPUIfNoMkl + @onlyOnCPUAndCUDA + def test_fftn_invalid(self, device): + a = torch.rand(10, 10, 10, device=device) + fft_funcs = (torch.fft.fftn, torch.fft.ifftn, + torch.fft.rfftn, torch.fft.irfftn) + + for func in fft_funcs: + with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + func(a, dim=(0, 1, 0)) + + with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + func(a, dim=(2, -1)) + + with self.assertRaisesRegex(RuntimeError, "dim and shape .* same length"): + func(a, s=(1,), dim=(0, 1)) + + with self.assertRaisesRegex(IndexError, "Dimension out of range"): + func(a, dim=(3,)) + + with self.assertRaisesRegex(RuntimeError, "tensor only has 3 dimensions"): + func(a, s=(10, 10, 10, 10)) + + c = torch.complex(a, a) + with self.assertRaisesRegex(RuntimeError, "Expected a real input"): + torch.fft.rfftn(c) + # Legacy fft tests def _test_fft_ifft_rfft_irfft(self, device, dtype): def _test_complex(sizes, signal_ndim, prepro_fn=lambda x: x): diff --git a/torch/csrc/api/include/torch/fft.h b/torch/csrc/api/include/torch/fft.h index 9622f668214f..1c119ed75226 100644 --- a/torch/csrc/api/include/torch/fft.h +++ b/torch/csrc/api/include/torch/fft.h @@ -35,6 +35,36 @@ inline Tensor ifft(const Tensor& self, return torch::fft_ifft(self, n, dim, norm); } +/// Computes the N dimensional fast Fourier transform over given dimensions. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.fftn. +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kComplexDouble); +/// torch::fft::fftn(t); +/// ``` +inline Tensor fftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_fftn(self, s, dim, norm); +} + +/// Computes the N dimensional fast Fourier transform over given dimensions. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.ifftn. 
+/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kComplexDouble); +/// torch::fft::ifftn(t); +/// ``` +inline Tensor ifftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_ifftn(self, s, dim, norm); +} + /// Computes the 1 dimensional FFT of real input with onesided Hermitian output. /// See https://pytorch.org/docs/master/fft.html#torch.fft.rfft. /// @@ -69,6 +99,36 @@ inline Tensor irfft(const Tensor& self, return torch::fft_irfft(self, n, dim, norm); } +/// Computes the N dimensional FFT of real input with onesided Hermitian output. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.rfftn +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kDouble); +/// torch::fft::rfftn(t); +/// ``` +inline Tensor rfftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_rfftn(self, s, dim, norm); +} + +/// Computes the inverse of torch.fft.rfftn. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.irfftn. +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kComplexDouble); +/// torch::fft::irfftn(t); +/// ``` +inline Tensor irfftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_irfftn(self, s, dim, norm); +} + /// Computes the 1 dimensional FFT of a onesided Hermitian signal /// /// The input represents a Hermitian symmetric time domain signal. The returned diff --git a/torch/fft/__init__.py b/torch/fft/__init__.py index 08b5d28b05ae..3e4bcc35464b 100644 --- a/torch/fft/__init__.py +++ b/torch/fft/__init__.py @@ -87,6 +87,101 @@ tensor([0.+0.j, 1.+0.j, 2.+0.j, 3.+0.j]) """) +fftn = _add_docstr(_fft.fft_fftn, r""" +fftn(input, s=None, dim=None, norm=None) -> Tensor + +Computes the N dimensional discrete Fourier transform of :attr:`input`. + +Note: + + The Fourier domain representation of any real signal satisfies the + Hermitian property: ``X[i_1, ..., i_n] = conj(X[-i_1, ..., -i_n])``. This + function always returns all positive and negative frequency terms even + though, for real inputs, half of these values are redundant. + :func:`~torch.fft.rfftn` returns the more compact one-sided representation + where only the positive frequencies of the last dimension are returned. + +Args: + input (Tensor): the input tensor + s (Tuple[int], optional): Signal size in the transformed dimensions. + If given, each dimension ``dim[i]`` will either be zero-padded or + trimmed to the length ``s[i]`` before computing the FFT. + If a length ``-1`` is specified, no padding is done in that dimension. + Default: ``s = [input.size(d) for d in dim]`` + dim (Tuple[int], optional): Dimensions to be transformed. + Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given. + norm (str, optional): Normalization mode. For the forward transform + (:func:`~torch.fft.fftn`), these correspond to: + + * ``"forward"`` - normalize by ``1/n`` + * ``"backward"`` - no normalization + * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the FFT orthonormal) + + Where ``n = prod(s)`` is the logical FFT size. + Calling the backward transform (:func:`~torch.fft.ifftn`) with the same + normalization mode will apply an overall normalization of ``1/n`` + between the two transforms. This is required to make + :func:`~torch.fft.ifftn` the exact inverse. 
+ + Default is ``"backward"`` (no normalization). + +Example: + + >>> import torch.fft + >>> x = torch.rand(10, 10, dtype=torch.complex64) + >>> fftn = torch.fft.fftn(t) + + The discrete Fourier transform is separable, so :func:`~torch.fft.fftn` + here is equivalent to two one-dimensional :func:`~torch.fft.fft` calls: + + >>> two_ffts = torch.fft.fft(torch.fft.fft(x, dim=0), dim=1) + >>> torch.allclose(fftn, two_ffts) + +""") + +ifftn = _add_docstr(_fft.fft_ifftn, r""" +ifftn(input, s=None, dim=None, norm=None) -> Tensor + +Computes the N dimensional inverse discrete Fourier transform of :attr:`input`. + +Args: + input (Tensor): the input tensor + s (Tuple[int], optional): Signal size in the transformed dimensions. + If given, each dimension ``dim[i]`` will either be zero-padded or + trimmed to the length ``s[i]`` before computing the IFFT. + If a length ``-1`` is specified, no padding is done in that dimension. + Default: ``s = [input.size(d) for d in dim]`` + dim (Tuple[int], optional): Dimensions to be transformed. + Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given. + norm (str, optional): Normalization mode. For the backward transform + (:func:`~torch.fft.ifftn`), these correspond to: + + * ``"forward"`` - no normalization + * ``"backward"`` - normalize by ``1/n`` + * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the IFFT orthonormal) + + Where ``n = prod(s)`` is the logical IFFT size. + Calling the forward transform (:func:`~torch.fft.fftn`) with the same + normalization mode will apply an overall normalization of ``1/n`` between + the two transforms. This is required to make :func:`~torch.fft.ifftn` + the exact inverse. + + Default is ``"backward"`` (normalize by ``1/n``). + +Example: + + >>> import torch.fft + >>> x = torch.rand(10, 10, dtype=torch.complex64) + >>> ifftn = torch.fft.ifftn(t) + + The discrete Fourier transform is separable, so :func:`~torch.fft.ifftn` + here is equivalent to two one-dimensional :func:`~torch.fft.ifft` calls: + + >>> two_iffts = torch.fft.ifft(torch.fft.ifft(x, dim=0), dim=1) + >>> torch.allclose(ifftn, two_iffts) + +""") + rfft = _add_docstr(_fft.fft_rfft, r""" rfft(input, n=None, dim=-1, norm=None) -> Tensor @@ -199,6 +294,136 @@ tensor([0.0000, 1.0000, 2.0000, 3.0000, 4.0000]) """) +rfftn = _add_docstr(_fft.fft_rfftn, r""" +rfftn(input, s=None, dim=None, norm=None) -> Tensor + +Computes the N-dimensional discrete Fourier transform of real :attr:`input`. + +The FFT of a real signal is Hermitian-symmetric, +``X[i_1, ..., i_n] = conj(X[-i_1, ..., -i_n])`` so the full +:func:`~torch.fft.fftn` output contains redundant information. +:func:`~torch.fft.rfftn` instead omits the negative frequencies in the +last dimension. + +Args: + input (Tensor): the input tensor + s (Tuple[int], optional): Signal size in the transformed dimensions. + If given, each dimension ``dim[i]`` will either be zero-padded or + trimmed to the length ``s[i]`` before computing the real FFT. + If a length ``-1`` is specified, no padding is done in that dimension. + Default: ``s = [input.size(d) for d in dim]`` + dim (Tuple[int], optional): Dimensions to be transformed. + Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given. + norm (str, optional): Normalization mode. 
For the forward transform + (:func:`~torch.fft.rfftn`), these correspond to: + + * ``"forward"`` - normalize by ``1/n`` + * ``"backward"`` - no normalization + * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the real FFT orthonormal) + + Where ``n = prod(s)`` is the logical FFT size. + Calling the backward transform (:func:`~torch.fft.irfftn`) with the same + normalization mode will apply an overall normalization of ``1/n`` between + the two transforms. This is required to make :func:`~torch.fft.irfftn` + the exact inverse. + + Default is ``"backward"`` (no normalization). + +Example: + + >>> import torch.fft + >>> t = torch.rand(10, 10) + >>> rfftn = torch.fft.rfftn(t) + >>> rfftn.size() + torch.Size([10, 6]) + + Compared against the full output from :func:`~torch.fft.fftn`, we have all + elements up to the Nyquist frequency. + + >>> fftn = torch.fft.fftn(t) + >>> torch.allclose(fftn[..., :6], rfftn) + True + + The discrete Fourier transform is separable, so :func:`~torch.fft.rfftn` + here is equivalent to a combination of :func:`~torch.fft.fft` and + :func:`~torch.fft.rfft`: + + >>> two_ffts = torch.fft.fft(torch.fft.rfft(x, dim=1), dim=0) + >>> torch.allclose(rfftn, two_ffts) + +""") + +irfftn = _add_docstr(_fft.fft_irfftn, r""" +irfftn(input, s=None, dim=None, norm=None) -> Tensor + +Computes the inverse of :func:`~torch.fft.rfftn`. + +:attr:`input` is interpreted as a one-sided Hermitian signal in the Fourier +domain, as produced by :func:`~torch.fft.rfftn`. By the Hermitian property, the +output will be real-valued. + +Note: + Some input frequencies must be real-valued to satisfy the Hermitian + property. In these cases the imaginary component will be ignored. + For example, any imaginary component in the zero-frequency term cannot + be represented in a real output and so will always be ignored. + +Note: + The correct interpretation of the Hermitian input depends on the length of + the original data, as given by :attr:`s`. This is because each input shape + could correspond to either an odd or even length signal. By default, the + signal is assumed to be even length and odd signals will not round-trip + properly. So, it is recommended to always pass the signal shape :attr:`s`. + +Args: + input (Tensor): the input tensor + s (Tuple[int], optional): Signal size in the transformed dimensions. + If given, each dimension ``dim[i]`` will either be zero-padded or + trimmed to the length ``s[i]`` before computing the real FFT. + If a length ``-1`` is specified, no padding is done in that dimension. + Defaults to even output in the last dimension: + ``s[-1] = 2*(input.size(dim[-1]) - 1)``. + dim (Tuple[int], optional): Dimensions to be transformed. + The last dimension must be the half-Hermitian compressed dimension. + Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given. + norm (str, optional): Normalization mode. For the backward transform + (:func:`~torch.fft.irfftn`), these correspond to: + + * ``"forward"`` - no normalization + * ``"backward"`` - normalize by ``1/n`` + * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the real IFFT orthonormal) + + Where ``n = prod(s)`` is the logical IFFT size. + Calling the forward transform (:func:`~torch.fft.rfftn`) with the same + normalization mode will apply an overall normalization of ``1/n`` between + the two transforms. This is required to make :func:`~torch.fft.irfftn` + the exact inverse. + + Default is ``"backward"`` (normalize by ``1/n``). 
+ +Example: + + >>> import torch.fft + >>> t = torch.rand(10, 9) + >>> T = torch.fft.rfftn(t) + + Without specifying the output length to :func:`~torch.fft.irfft`, the output + will not round-trip properly because the input is odd-length in the last + dimension: + + >>> torch.fft.irfftn(T).size() + torch.Size([10, 10]) + + So, it is recommended to always pass the signal shape :attr:`s`. + + >>> roundtrip = torch.fft.irfftn(T, t.size()) + >>> roundtrip.size() + torch.Size([10, 9]) + >>> torch.allclose(roundtrip, t) + True + +""") + hfft = _add_docstr(_fft.fft_hfft, r""" hfft(input, n=None, dim=-1, norm=None) -> Tensor From 0b6b7358633dfaa84881ae00608b621e1e35c6fc Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 23 Sep 2020 22:21:23 -0700 Subject: [PATCH 079/449] [fix] type promotion atan2 (#43466) Summary: Fixes https://github.com/pytorch/pytorch/issues/43360 Pull Request resolved: https://github.com/pytorch/pytorch/pull/43466 Reviewed By: malfet Differential Revision: D23834928 Pulled By: mruberry fbshipit-source-id: 2e7e0b4fcf1a846efc171c275d65a6daffd3c631 --- aten/src/ATen/native/BinaryOps.cpp | 8 +++++--- test/test_torch.py | 21 ++++++++++++++++++- test/test_type_promotion.py | 33 +++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index cab77c25b885..f8af756773c9 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -390,14 +390,16 @@ Tensor rsub(const Tensor& self, const Tensor& other, Scalar alpha) { } Tensor& atan2_out(Tensor& result, const Tensor& self, const Tensor& other) { - auto iter = TensorIterator::binary_op(result, self, other); + auto iter = TensorIterator::binary_float_op(result, self, other); atan2_stub(iter.device_type(), iter); return result; } Tensor atan2(const Tensor& self, const Tensor& other) { - Tensor result = at::empty({0}, self.options()); - return native::atan2_out(result, self, other); + Tensor result; + auto iter = TensorIterator::binary_float_op(result, self, other); + atan2_stub(iter.device_type(), iter); + return iter.output(); } Tensor& atan2_(Tensor& self, const Tensor& other) { diff --git a/test/test_torch.py b/test/test_torch.py index 4b08697a908c..70556dd2d2aa 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -19672,10 +19672,20 @@ def test_movedim_view(self, device): torch.int8, torch.short, torch.int, torch.long ] +_integer_types = [ + torch.uint8, torch.int8, torch.int16, + torch.int32, torch.int64 +] + _cpu_types: List[torch.dtype] = [] _unsigned_types = [torch.uint8] +# Binary Float Ops +# Operators which use TensorIterator::binary_float_op +# These Ops promote integer inputs to Float. +binary_float_ops_inplace = ['atan2_', 'div_'] + # Helper values and functions for producing tensors and scalars to use in tensor op tests. # Tensor dimension sizes (Small, Medium, Large, Giant) _S = 5 @@ -19896,7 +19906,7 @@ def inner(self, device, dtype): lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_1d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, [_wrap_maybe_warns("This overload of addr_? 
is deprecated")]), - ('atan2', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-2, 1e-5, 1e-5, _float_types), + ('atan2', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-2, 1e-5, 1e-5, _types, _types_no_half), ('angle', '', _small_3d, lambda t, d: [], 0, 0, 0, _types_no_half, [torch.bfloat16], False), ('fmod', 'value', _small_3d, lambda t, d: [3], 1e-3), ('fmod', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d, has_zeros=False)], 1e-3), @@ -20188,6 +20198,15 @@ def fn(self, device, dtype) -> None: (isinstance(arg, torch.Tensor) and arg.dtype == torch.float) else arg for arg in device_args] + # Special case for binary float ops (binary ops that promote int to float) + if op_str in binary_float_ops_inplace and \ + 'inplace' in subtest_str and dtype in _integer_types: + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to "): + cpu_result = getattr(cpu_tensor, op_str)(*cpu_args) + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to "): + device_result = getattr(device_tensor, op_str)(*device_args) + return # Nothing more to check + # Runs the tensor op on CPU and device cpu_result = getattr(cpu_tensor, op_str)(*cpu_args) device_result = getattr(device_tensor, op_str)(*device_args) diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py index 9ee90c7cbcd8..7f10915a5ac4 100644 --- a/test/test_type_promotion.py +++ b/test/test_type_promotion.py @@ -7,7 +7,7 @@ from torch.testing._internal.common_utils import (TestCase, run_tests, load_tests, TEST_NUMPY, torch_to_numpy_dtype_dict) from torch.testing._internal.common_device_type import (instantiate_device_type_tests, onlyOnCPUAndCUDA, - dtypes, onlyCPU) + dtypes, dtypesIfCUDA, onlyCPU) if TEST_NUMPY: import numpy as np @@ -958,6 +958,37 @@ def test_computation_ignores_out(self, device): self.assertEqual(result, a - b, exact_dtype=False) self.assertNotEqual(result, a.double() - b, exact_dtype=False) + @dtypesIfCUDA(*itertools.product(torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False), + torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False))) + @dtypes(*itertools.product(torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False, + include_complex=False), + torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False, + include_complex=False))) + def test_atan2_type_promotion(self, device, dtypes): + dtype1, dtype2 = dtypes + default_float = torch.get_default_dtype() + + def is_int(dtype): + return dtype in torch.testing.get_all_int_dtypes() + [torch.bool] + + def is_float(dtype): + return dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=False) + + def get_binary_float_result_type(x, y): + dtype1 = x.dtype + dtype2 = y.dtype + if is_float(dtype1) and is_float(dtype2): + return torch.result_type(x, y) + elif is_float(dtype1) and is_int(dtype2): + return dtype1 + elif is_int(dtype1) and is_float(dtype2): + return dtype2 + elif is_int(dtype1) and is_int(dtype2): + return default_float + + x = torch.tensor(1, dtype=dtype1, device=device) + y = torch.tensor(2, dtype=dtype2, device=device) + self.assertEqual(get_binary_float_result_type(x, y), torch.atan2(x, y).dtype) instantiate_device_type_tests(TestTypePromotion, globals()) From b470fa450038a5108f55894870373c763ff4c431 Mon Sep 17 00:00:00 2001 From: Hong Xu Date: Wed, 23 Sep 2020 23:01:01 -0700 Subject: [PATCH 080/449] Add complex number support for binary logical operators (#43174) Summary: Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/43174 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D23684425 Pulled By: mruberry fbshipit-source-id: 4857b16e18ec4c65327136badd7f04c74e32d330 --- aten/src/ATen/native/cpu/BinaryOpsKernel.cpp | 12 ++++++------ aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu | 9 ++++++--- c10/util/complex.h | 5 +++++ test/test_torch.py | 11 ----------- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 09847a010ee3..67a961401fb0 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -237,14 +237,14 @@ void logical_and_kernel(TensorIterator& iter) { // We use if-else here specifically for bool instead of using iter.common_dtype() like the CUDA implementation because // common_dtype() is unavailable for bfloat16. if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_and_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_and_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a && b; }); }); } else { - AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "logical_and_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "logical_and_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { return static_cast(a && b); @@ -257,14 +257,14 @@ void logical_or_kernel(TensorIterator& iter) { // We use if-else here specifically for bool instead of using iter.common_dtype() like the CUDA implementation because // common_dtype() is unavailable for bfloat16. if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_or_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_or_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a || b; }); }); } else { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.dtype(), "logical_or_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.dtype(), "logical_or_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { return static_cast(a || b); @@ -277,14 +277,14 @@ void logical_xor_kernel(TensorIterator& iter) { // We use if-else here specifically for bool instead of using iter.common_dtype() like the CUDA implementation because // common_dtype() is unavailable for bfloat16. 
if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_xor_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_xor_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return bool(a) != bool(b); }); }); } else { - AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "logical_xor_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "logical_xor_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { return static_cast(bool(a) != bool(b)); diff --git a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu index 20a851d1b2ce..de11baa28210 100644 --- a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu @@ -10,7 +10,8 @@ namespace at { namespace native { void logical_and_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, iter.common_dtype(), "logical_and_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, + iter.common_dtype(), "logical_and_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a && b; }); @@ -18,7 +19,8 @@ void logical_and_kernel_cuda(TensorIterator& iter) { } void logical_or_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, iter.common_dtype(), "logical_or_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, + iter.common_dtype(), "logical_or_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a || b; }); @@ -26,7 +28,8 @@ void logical_or_kernel_cuda(TensorIterator& iter) { } void logical_xor_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, iter.common_dtype(), "logical_xor_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, + iter.common_dtype(), "logical_xor_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return bool(a) != bool(b); }); diff --git a/c10/util/complex.h b/c10/util/complex.h index 53ec4f30e539..9c63a2b296fb 100644 --- a/c10/util/complex.h +++ b/c10/util/complex.h @@ -257,6 +257,11 @@ struct alignas(sizeof(T) * 2) complex { } #endif + // consistent with NumPy behavior + explicit constexpr operator bool() const { + return real() || imag(); + } + constexpr T real() const { return real_; } diff --git a/test/test_torch.py b/test/test_torch.py index 70556dd2d2aa..ee27c8dd65cf 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6342,11 +6342,6 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): a = torch.tensor(a_, dtype=dtypes[0], device=device) b = torch.tensor(b_, dtype=dtypes[1], device=device) - if dtypes[0].is_complex or dtypes[1].is_complex: - with self.assertRaises(RuntimeError): - getattr(a, op)(b) - return - # new tensor self.assertEqual(expected_res.bool(), getattr(a, op)(b)) # out @@ -6361,12 +6356,6 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): getattr(a, op + '_')(b) return - # TODO: remove when complex ops are supported - if dtypes[0].is_complex: - with self.assertRaises(RuntimeError): - getattr(a, op + '_')(b) - return - getattr(a, op + '_')(b) self.assertEqual(expected_res, a) From 
3dd0e362db0cadc37b377d3db898464e97e518d7 Mon Sep 17 00:00:00 2001 From: Alex Suhan Date: Wed, 23 Sep 2020 23:17:32 -0700 Subject: [PATCH 081/449] [TensorExpr] Fix min and max for integral inputs in CUDA backend (#44984) Summary: For integral types, isnan is meaningless. Provide specializations for maximum and minimum which don't call it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/44984 Test Plan: python test/test_jit_fuser_te.py -k TestTEFuser.test_minmax_int_ops Reviewed By: ezyang Differential Revision: D23885259 Pulled By: asuhan fbshipit-source-id: 2e6da2c43c0ed18f0b648a2383d510894c574437 --- test/test_jit_fuser_te.py | 39 ++++++++++++++++++++++ torch/csrc/jit/tensorexpr/cuda_codegen.cpp | 12 +++++-- torch/csrc/jit/tensorexpr/kernel.cpp | 4 +-- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index dc7e67a14ee2..6fab65006927 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -536,6 +536,45 @@ def apply(fn): " ".join(["Failed:", str(dtype), op.__name__, device]) ) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_minmax_int_ops(self): + def apply(fn): + return lambda x, y, z: fn(fn(x, y), z) + + dtypes = [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + torch.bool, + ] + binary_ops = [ + torch.min, + torch.max + ] + devices = ["cuda"] + for dtype, op, device in product(dtypes, binary_ops, devices): + try: + x = self.data_for(dtype, device) + y = self.data_for(dtype, device) + z = self.data_for(dtype, device) + fn = apply(op) + ref = fn(x, y, z) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. 
+ continue + try: + t = torch.jit.trace(fn, (x, y, z)) + self.assertEqual(ref, t(x, y, z)) + self.assertAllFused(t.graph_for(x, y, z)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device]) + ) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") def test_comparison_eq_ne(self): def f(x, y): diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index a64e413657d0..06e6703d494a 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -453,7 +453,11 @@ void CudaPrinter::visit(const AtomicAdd* v) { } void CudaPrinter::visit(const Max* v) { - os() << "maximum("; + if (is_integral(v->dtype().scalar_type())) { + os() << "max("; + } else { + os() << "maximum("; + } v->lhs()->accept(this); os() << ","; v->rhs()->accept(this); @@ -461,7 +465,11 @@ void CudaPrinter::visit(const Max* v) { } void CudaPrinter::visit(const Min* v) { - os() << "minimum("; + if (is_integral(v->dtype().scalar_type())) { + os() << "min("; + } else { + os() << "minimum("; + } v->lhs()->accept(this); os() << ","; v->rhs()->accept(this); diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 293ea780ed27..833881cc0e4f 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -816,14 +816,14 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::min: { return computeTwoOperand( "aten_min", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return Min::make(lhs, rhs, false); + return Min::make(boolToInteger(lhs), boolToInteger(rhs), false); }); } break; case aten::max: { return computeTwoOperand( "aten_max", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return Max::make(lhs, rhs, false); + return Max::make(boolToInteger(lhs), boolToInteger(rhs), false); }); } break; From b3d7c2f97859973c7282a772b811708379064d37 Mon Sep 17 00:00:00 2001 From: Negin Raoof Date: Wed, 23 Sep 2020 23:26:26 -0700 Subject: [PATCH 082/449] [ONNX] Update ONNX docs for release (#45086) Summary: ONNX doc updates. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45086 Reviewed By: ezyang Differential Revision: D23880383 Pulled By: bzinodev fbshipit-source-id: ca29782fd73024967ee7708c217a005233e7b970 --- docs/source/onnx.rst | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index ea45a2d7070a..3c07486b0e89 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -231,6 +231,25 @@ The dynamic control flow is captured correctly. We can verify in backends with d # [37, 37, 37]], dtype=int64)] +To avoid exporting a variable scalar tensor as a fixed value constant as part of the ONNX model, please +avoid use of ``torch.Tensor.item()``. Torch supports implicit cast of single-element tensors to numbers. 
+E.g.: :: + + class LoopModel(torch.nn.Module): + def forward(self, x, y): + res = [] + arr = x.split(2, 0) + for i in range(int(y)): + res += [arr[i].sum(0, False)] + return torch.stack(res) + + model = torch.jit.script(LoopModel()) + inputs = (torch.randn(16), torch.tensor(8)) + + out = model(*inputs) + torch.onnx.export(model, inputs, 'loop_and_list.onnx', opset_version=11, example_outputs=out) + + TorchVision support ------------------- @@ -262,6 +281,7 @@ The following operators are supported: * Conv * Dropout * Embedding (no optional arguments supported) +* EmbeddingBag * FeatureDropout (training mode not supported) * Index * MaxPool1d @@ -289,6 +309,7 @@ The following operators are supported: * avg_pool2d * avg_pool2d * avg_pool3d +* as_strided * baddbmm * bitshift * cat @@ -314,6 +335,7 @@ The following operators are supported: * exp * expand * expand_as +* eye * flatten * floor * floor_divide @@ -335,9 +357,11 @@ The following operators are supported: * instance_norm * interpolate * isnan +* KLDivLoss * layer_norm * le * leaky_relu +* len * log * log1p * log2 @@ -358,6 +382,9 @@ The following operators are supported: * narrow * ne * neg +* new_empty +* new_full +* new_zeros * nll_loss * nonzero * norm @@ -811,7 +838,10 @@ Q: Is tensor list exportable to ONNX? Yes, this is supported now for ONNX opset version >= 11. ONNX introduced the concept of Sequence in opset 11. Similar to list, Sequence is a data type that contains arbitrary number of Tensors. - Associated operators are also introduced in ONNX, such as SequenceInsert, SequenceAt, etc. E.g.: :: + Associated operators are also introduced in ONNX, such as SequenceInsert, SequenceAt, etc. + However, in-place list append within loops is not exportable to ONNX. To implement this, please use inplace + add operator. + E.g.: :: class ListLoopModel(torch.nn.Module): def forward(self, x): @@ -820,8 +850,8 @@ Q: Is tensor list exportable to ONNX? arr = x.split(2, 0) res2 = torch.zeros(3, 4, dtype=torch.long) for i in range(len(arr)): - res = res.append(arr[i].sum(0, False)) - res1 = res1.append(arr[-1 - i].sum(0, False)) + res += [arr[i].sum(0, False)] + res1 += [arr[-1 - i].sum(0, False)] res2 += 1 return torch.stack(res), torch.stack(res1), res2 From 29dc3c5ec821f5b9026e1c847c0ac605672e95af Mon Sep 17 00:00:00 2001 From: Alexander Date: Thu, 24 Sep 2020 00:05:25 -0700 Subject: [PATCH 083/449] Sparse softmax support (CUDA) (#42307) Summary: This PR implements softmax support for sparse tensors. Resolves gh-23651 for CUDA. 
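A minimal usage sketch (values are made up; this assumes the new kernels are reached through the existing `torch.sparse.softmax` / `torch.sparse.log_softmax` Python wrappers):

```python
import torch

i = torch.tensor([[0, 0, 1], [0, 2, 1]])
v = torch.tensor([1.0, 2.0, 3.0])
s = torch.sparse_coo_tensor(i, v, (2, 3), device="cuda")

# Unspecified entries are treated as -inf, so they contribute zero
# probability and remain unspecified in the result.
probs = torch.sparse.softmax(s, dim=1)
log_probs = torch.sparse.log_softmax(s, dim=1)
```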
- [x] sparse softmax - [x] CUDA C++ implementation - [x] unittests - [x] update softmax documentation - [x] autograd support - [x] sparse log_softmax - [x] CUDA C++ implementation - [x] unittests - [x] update log_softmax documentation - [x] autograd support Here are some benchmark (script is [here](https://gist.github.com/aocsa/fbc1827b3e49901512a33ba96092cbc1)) results for `torch.sparse.softmax and torch.softmax`, using CPU and GPU, values are float64 scalars, timing repeat is 1000: | size | density | sparse CUDA | sparse CPU | |--------------|---------|-------------|------------| | (32, 10000) | 0.01 | 380.2 | 687.5 | | (32, 10000) | 0.05 | 404.3 | 2357.9 | | (32, 10000) | 0.1 | 405.9 | 3677.2 | | (512, 10000) | 0.01 | 438.0 | 5443.4 | | (512, 10000) | 0.05 | 888.1 | 24485.0 | | (512, 10000) | 0.1 | 1921.3 | 45340.5 | | size | density | dense CUDA | dense CPU | |--------------|---------|-------------|------------| | (32, 10000) | 0.01 | 23.6 | 1943.2 | | (32, 10000) | 0.05 | 23.6 | 1954.0 | | (32, 10000) | 0.1 | 23.5 | 1950.0 | | (512, 10000) | 0.01 | 639.3 | 39797.9 | | (512, 10000) | 0.05 | 640.3 | 39374.4 | | (512, 10000) | 0.1 | 639.6 | 39192.3 | Times are in microseconds (us). Quick note: I updated the performance test again. Pull Request resolved: https://github.com/pytorch/pytorch/pull/42307 Reviewed By: ngimel Differential Revision: D23774427 Pulled By: mruberry fbshipit-source-id: bfabf726075b39dde544c10249f27ae1871f82c7 --- aten/src/ATen/native/native_functions.yaml | 4 + aten/src/ATen/native/sparse/ParamUtils.cpp | 53 ++ aten/src/ATen/native/sparse/ParamUtils.h | 24 + aten/src/ATen/native/sparse/SoftMax.cpp | 117 ++-- aten/src/ATen/native/sparse/cuda/SoftMax.cu | 641 ++++++++++++++++++++ test/test_sparse.py | 2 +- 6 files changed, 777 insertions(+), 64 deletions(-) create mode 100644 aten/src/ATen/native/sparse/ParamUtils.cpp create mode 100644 aten/src/ATen/native/sparse/ParamUtils.h create mode 100644 aten/src/ATen/native/sparse/cuda/SoftMax.cu diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index ae3579cd0aa9..f5bbb263ed9c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3676,11 +3676,13 @@ use_c10_dispatcher: full dispatch: SparseCPU: softmax_sparse_cpu + SparseCUDA: softmax_sparse_cuda - func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor use_c10_dispatcher: full dispatch: SparseCPU: softmax_backward_sparse_cpu + SparseCUDA: softmax_backward_sparse_cuda - func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full @@ -3693,11 +3695,13 @@ use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_sparse_cpu + SparseCUDA: log_softmax_sparse_cuda - func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_backward_sparse_cpu + SparseCUDA: log_softmax_backward_sparse_cuda - func: norm.ScalarOpt_dtype(Tensor self, Scalar? 
p, *, ScalarType dtype) -> Tensor use_c10_dispatcher: full diff --git a/aten/src/ATen/native/sparse/ParamUtils.cpp b/aten/src/ATen/native/sparse/ParamUtils.cpp new file mode 100644 index 000000000000..f2a4c97571b9 --- /dev/null +++ b/aten/src/ATen/native/sparse/ParamUtils.cpp @@ -0,0 +1,53 @@ +#include +#include +#include +#include + +namespace at { +namespace native { + +std::pair softmax_sparse_input_preprocessing( + const Tensor& input_, + const int64_t dim_, + const bool half_to_float, + CheckedFrom function_name) { + TORCH_INTERNAL_ASSERT(input_.is_sparse()); + TORCH_CHECK( + !half_to_float, + std::string(function_name) + + ": with half to float conversion is not supported on " + + input_.device().str()); + auto input = input_.coalesce(); + Tensor output = at::native::empty_like(input); + TORCH_CHECK( + dim_ >= 0 && dim_ < input.dim(), + ": dim must be non-negative and less than input dimensions"); + return std::make_pair(input, output); +} + +std::tuple softmax_backward_sparse_input_preprocessing( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_, + CheckedFrom function_name) { + TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; + checkSameSize(function_name, grad_arg, output_arg); + + int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); + + auto grad = grad_.coalesce(); + auto output = output_.coalesce(); + + Tensor grad_input = at::native::empty_like(output); + TORCH_CHECK( + dim >= 0 && dim < grad.dim(), + ": dim must be non-negative and less than input dimensions"); + TORCH_CHECK( + grad.sparse_dim() == output.sparse_dim(), + ": grad and output sparse dimensions must be equal"); + return std::make_tuple(grad_input, grad, output); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/ParamUtils.h b/aten/src/ATen/native/sparse/ParamUtils.h new file mode 100644 index 000000000000..c9b2e3d999ad --- /dev/null +++ b/aten/src/ATen/native/sparse/ParamUtils.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include +#include + +namespace at { +namespace native { + +TORCH_API std::pair softmax_sparse_input_preprocessing( + const Tensor& input_, + const int64_t dim_, + const bool half_to_float, + CheckedFrom function_name); + +TORCH_API std::tuple softmax_backward_sparse_input_preprocessing( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_, + CheckedFrom function_name); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/SoftMax.cpp b/aten/src/ATen/native/sparse/SoftMax.cpp index 1544c6e499e7..6070faf635c5 100644 --- a/aten/src/ATen/native/sparse/SoftMax.cpp +++ b/aten/src/ATen/native/sparse/SoftMax.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace at { @@ -291,10 +292,10 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di if (dim >= sparse_dim) { if (LogSoftMax) { auto new_values = log_softmax_cpu(values, dim - sparse_dim + 1, false); - out_values.copy_(new_values); + out_values.set_(new_values); } else { auto new_values = softmax_cpu(values, dim - sparse_dim + 1, false); - out_values.copy_(new_values); + out_values.set_(new_values); } return; } @@ -411,17 +412,27 @@ void cpu_sparse_coo_softmax_backward(Tensor& grad_input, const Tensor& grad, con auto grad_offsets = get_offsets(grad_indices, sizes, -1); if (dim >= sparse_dim) { - for(int64_t i=0; i= 0 && dim_ < input.dim(), - "dim must be non-negative and less than input dimensions"); 
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "softmax", [&] { - cpu_sparse_coo_softmax(output, input, dim_); + cpu_sparse_coo_softmax(output, input, dim); }); return output; } -Tensor log_softmax_sparse_cpu(const Tensor& input_, const int64_t dim_, const bool half_to_float) { - TORCH_INTERNAL_ASSERT(input_.is_sparse()); - TORCH_CHECK(!half_to_float, "log_softmax with half to float conversion is not supported on CPU"); - auto input = input_.coalesce(); - Tensor output = at::native::empty_like(input); +Tensor log_softmax_sparse_cpu( + const Tensor& input_, + const int64_t dim, + const bool half_to_float) { + Tensor input, output; + std::tie(input, output) = softmax_sparse_input_preprocessing( + input_, dim, half_to_float, "log_softmax"); if (input.numel() == 0) { return output; } - TORCH_CHECK(dim_ >= 0 && dim_ < input.dim(), - "dim must be non-negative and less than input dimensions"); AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_softmax", [&] { - cpu_sparse_coo_softmax(output, input, dim_); + cpu_sparse_coo_softmax(output, input, dim); }); return output; } @@ -542,26 +553,16 @@ Tensor softmax_backward_sparse_cpu( const Tensor& output_, int64_t dim_, const Tensor& input_) { - TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; - checkSameSize("softmax_backward", grad_arg, output_arg); - - int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); - - auto grad = grad_.coalesce(); - auto output = output_.coalesce(); - - Tensor grad_input = at::native::empty_like(output); + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "softmax_backward"); if (output.numel() == 0) { return grad_input; } - TORCH_CHECK( - dim >= 0 && dim < grad.dim(), - "dim must be non-negative and less than input dimensions"); - TORCH_CHECK( - grad.sparse_dim() == output.sparse_dim(), - "grad and output sparse dimensions must be equal"); AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] { - cpu_sparse_coo_softmax_backward(grad_input, grad, output, dim); + cpu_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); }); return grad_input; } @@ -571,26 +572,16 @@ Tensor log_softmax_backward_sparse_cpu( const Tensor& output_, int64_t dim_, const Tensor& input_) { - TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; - checkSameSize("log_softmax_backward", grad_arg, output_arg); - - int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); - - auto grad = grad_.coalesce(); - auto output = output_.coalesce(); - - Tensor grad_input = at::native::empty_like(output); + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "log_softmax_backward"); if (output.numel() == 0) { return grad_input; } - TORCH_CHECK( - dim >= 0 && dim < grad.dim(), - "dim must be non-negative and less than input dimensions"); - TORCH_CHECK( - grad.sparse_dim() == output.sparse_dim(), - "grad and output sparse dimensions must be equal"); - AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] { - cpu_sparse_coo_softmax_backward(grad_input, grad, output, dim); + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "log_softmax_backward", [&] { + cpu_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); }); return grad_input; } diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu new file mode 100644 index 000000000000..26cb6aba04e0 
--- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu @@ -0,0 +1,641 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { +namespace native { +namespace { + +// Number of threads in a block given an input size up to MAX_BLOCK_SIZE +static int getNumThreads(int nElem) { +#if defined(__HIP_PLATFORM_HCC__) + int threadSizes[5] = {16, 32, 64, 128, 256}; +#else + int threadSizes[5] = {32, 64, 128, 256, 512}; +#endif + for (int i = 0; i != 5; ++i) { + if (nElem <= threadSizes[i]) { + return threadSizes[i]; + } + } + return threadSizes[4]; +} + +template +__global__ void cuda_sparse_coo_softmax_kernel( + int64_t* sorted_pool_indices, + int64_t size, + int64_t* pool_sizes, + int64_t* pool_offsets, + int64_t nvalues, + scalar_t* mx_rows, + PackedTensorAccessor input_values_acc, + PackedTensorAccessor output_values_acc) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax for the CPU + implementation of the sparse softmax algorithm that this implementation is + based on. + */ + int tid = threadIdx.x; + int blkid = blockIdx.x; + int blksz = blockDim.x; + int gridsz = gridDim.x; + + int index = tid + blkid * blksz; + int step = blksz * gridsz; + + while (index < size) { + int64_t offset = pool_offsets[index]; + int64_t* pool_indices = sorted_pool_indices + offset; + int64_t pool_indices_size = pool_sizes[index]; + scalar_t* mx_row = mx_rows + index * nvalues; + + for (int64_t j = 0; j < nvalues; j++) { + scalar_t exp_sums = 0; + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto values_row = input_values_acc[i]; + auto out_values_row = output_values_acc[i]; + + auto v = c10::cuda::compat::exp(values_row[j] - mx_row[j]); + if (!LogSoftMax) { + out_values_row[j] = v; + } + exp_sums += v; + } + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto values_row = input_values_acc[i]; + auto out_values_row = output_values_acc[i]; + + if (LogSoftMax) { + out_values_row[j] = values_row[j] - mx_row[j] - c10::cuda::compat::log(exp_sums); + } else { + out_values_row[j] *= 1.0 / exp_sums; + } + } + } + index += step; + } +} + +template +__global__ void cuda_sparse_coo_softmax_backward_kernel( + int64_t* sorted_pool_indices, + int64_t size, + int64_t* pool_sizes, + int64_t* pool_offsets, + int64_t nvalues, + int64_t grad_nnz, + int64_t* grad_offsets, + int64_t* out_offsets, + int64_t* lower_bound_values, + PackedTensorAccessor values_accessor, + PackedTensorAccessor out_values_accessor, + PackedTensorAccessor grad_values_accessor) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax_backward for + the CPU implementation of the sparse softmax backward algorithm that this + implementation is based on. 
+ */ + int tid = threadIdx.x; + int blkid = blockIdx.x; + int blksz = blockDim.x; + int gridsz = gridDim.x; + + int index = tid + blkid * blksz; + int step = blksz * gridsz; + + while (index < size) { + int64_t offset = pool_offsets[index]; + int64_t* pool_indices = sorted_pool_indices + offset; + int64_t pool_indices_size = pool_sizes[index]; + + for (int64_t k = 0; k < nvalues; k++) { + scalar_t tmp_row{0}; + + /* Compute tmp = - sum_j output_j * grad_j */ + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto out_values_row = out_values_accessor[i]; + auto j = lower_bound_values[i]; + + /* Update `tmp_row` accumulator only when limits and pools are valid */ + if (j < grad_nnz && (out_offsets[i] == grad_offsets[j])) { + auto grad_values_row = grad_values_accessor[j]; + if (LogSoftMax) { + tmp_row -= grad_values_row[k]; + } else { + tmp_row -= out_values_row[k] * grad_values_row[k]; + } + } + } + + /* Compute grad_input = output * (grad + tmp)*/ + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto out_values_row = out_values_accessor[i]; + auto values_row = values_accessor[i]; + auto j = lower_bound_values[i]; + if (j < grad_nnz && (out_offsets[i] == grad_offsets[j])) { + auto grad_values_row = grad_values_accessor[j]; + if (LogSoftMax) { + values_row[k] = grad_values_row[k] + + c10::cuda::compat::exp(out_values_row[k]) * tmp_row; + } else { + values_row[k] = + out_values_row[k] * (grad_values_row[k] + tmp_row); + } + } else { + if (LogSoftMax) { + values_row[k] = + c10::cuda::compat::exp(out_values_row[k]) * tmp_row; + } else { + values_row[k] = out_values_row[k] * tmp_row; + } + } + } + } + index += step; + } +} + +using thrust_ptr = thrust::device_ptr; + +Tensor get_offsets( + const Tensor& indices, + const IntArrayRef& sizes, + const int64_t dim) { + /* + See ATen/native/sparse/Softmax.cpp:get_offsets for the CPU + implementation of get_offsets function that this implementation is based on. + */ + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto ndim = indices.size(0); + auto nnz = indices.size(1); + std::vector host_strides(ndim, 1); + if (ndim > 1) { + for (int64_t i = ndim - 2; i >= 0; i--) { + host_strides[i] = + host_strides[i + 1] * (i + 1 == dim ? 
1 : sizes[i + 1]); + } + } + auto strides = at::empty({ndim}, indices.options()); + auto strides_ptr = strides.data_ptr(); + + AT_CUDA_CHECK(cudaMemcpyAsync( + strides_ptr, host_strides.data(), host_strides.size() * sizeof(int64_t), + cudaMemcpyHostToDevice, + stream)); + + auto indices_accessor = indices.packed_accessor(); + + Tensor offsets = at::empty({nnz}, indices.options()); + + thrust::transform( + policy, + thrust::make_counting_iterator(int64_t(0)), + thrust::make_counting_iterator(int64_t(nnz)), + thrust::device_ptr(offsets.data_ptr()), + [indices_accessor, strides_ptr, dim, ndim] __device__(int64_t x) { + int64_t pool_index = 0; + for (int64_t j = 0; j < ndim; j++) { + if (j != dim) { + auto indices_row = indices_accessor[j]; + auto stride = strides_ptr[j]; + pool_index += stride * indices_row[x]; + } + } + return pool_index; + }); + return offsets; +} + +template +std::tuple compute_pool_max( + const Tensor& indices, + const Tensor& values, + const IntArrayRef& sizes, + int64_t nvalues, + const int64_t dim) { + /* + Return pools of indices that align with the given dimension and the + corresponding max values for each pool. + + See ATen/native/sparse/Softmax.cpp:get_offsets and + ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax for the CPU + implementation that this implementation is based on. + */ + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto nnz = indices.size(1); + auto offsets = get_offsets(indices, sizes, dim); + int64_t* offsets_ptr = offsets.data_ptr(); + + auto sorted_indices = at::empty({nnz}, indices.options()); + thrust_ptr sorted_indices_thrust_ptr(sorted_indices.data_ptr()); + thrust::sequence( + policy, sorted_indices_thrust_ptr, sorted_indices_thrust_ptr + nnz, 0); + + thrust::sort( + policy, + sorted_indices_thrust_ptr, + sorted_indices_thrust_ptr + nnz, + [offsets_ptr] __device__(int64_t x, int64_t y) { + return offsets_ptr[x] < offsets_ptr[y]; + }); + auto pool_sizes = at::empty({nnz}, indices.options()); + + auto new_end = thrust::reduce_by_key( + policy, + sorted_indices_thrust_ptr, + sorted_indices_thrust_ptr + nnz, + thrust::make_constant_iterator(int64_t(1)), + thrust::make_discard_iterator(), + thrust_ptr(pool_sizes.data_ptr()), + [offsets_ptr] __device__(int64_t x, int64_t y) { + return offsets_ptr[x] == offsets_ptr[y]; + }); + auto new_sz = thrust::distance( + thrust_ptr(pool_sizes.data_ptr()), new_end.second); + pool_sizes.resize_({new_sz}); + + auto pool_offsets = pool_sizes.clone(); + thrust_ptr pool_offsets_thrust_ptr( + pool_offsets.data_ptr()); + thrust::exclusive_scan( + policy, + pool_offsets_thrust_ptr, + pool_offsets_thrust_ptr + new_sz, + pool_offsets_thrust_ptr); + + Tensor mx_buffer; + if (requireMxRows) { + + auto values_accessor = + values.packed_accessor(); // {nnz, nvalues} + + mx_buffer = at::full({new_sz * nvalues}, Scalar(-std::numeric_limits::infinity()), values.options()); + + auto mx_buffer_ptr = mx_buffer.data_ptr(); + + auto pool_sizes_ptr = pool_sizes.data_ptr(); + auto sorted_indices_ptr = sorted_indices.data_ptr(); + auto pool_offsets_ptr = pool_offsets.data_ptr(); + + thrust::for_each( + policy, + thrust::make_counting_iterator(int64_t(0)), + thrust::make_counting_iterator(int64_t(new_sz)), + [values_accessor, + sorted_indices_ptr, + pool_sizes_ptr, + pool_offsets_ptr, + mx_buffer_ptr, + nvalues] __device__(int64_t index) { + int64_t curr_pool_size = pool_sizes_ptr[index]; 
+ auto mx_row = mx_buffer_ptr + index * nvalues; + int64_t offset = pool_offsets_ptr[index]; + for (int64_t p = 0; p < curr_pool_size; p++) { + int64_t i = *(sorted_indices_ptr + offset + p); + auto values_row = values_accessor[i].data(); + for (int64_t j = 0; j < nvalues; j++) { + mx_row[j] = c10::cuda::compat::max(mx_row[j], values_row[j]); + } + } + }); + } + return std::make_tuple( + sorted_indices, pool_offsets, pool_sizes, mx_buffer); +} + +template +void cuda_sparse_coo_softmax( + Tensor& output, + const Tensor& input, + const int64_t dim) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax for the CPU + implementation of the sparse softmax algorithm that this implementation is + based on. + */ + auto sparse_dim = input.sparse_dim(); + auto indices = input._indices().contiguous(); + auto values = input._values().contiguous(); + auto out_values = output._values(); + auto out_indices = output._indices(); + out_values.resize_as_(values); + out_indices.resize_as_(indices); + out_indices.copy_(indices); + + if (dim >= sparse_dim) { + if (LogSoftMax) { + auto new_values = log_softmax_cuda(values, dim - sparse_dim + 1, false); + out_values.set_(new_values); + } else { + auto new_values = softmax_cuda(values, dim - sparse_dim + 1, false); + out_values.set_(new_values); + } + return; + } + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto nnz = values.size(0); + auto sizes = input.sizes(); + auto nvalues = values.numel() / nnz; + + /* Prepare accessors */ + auto values_2 = values.view({nnz, nvalues}); + auto values_accessor = values_2.packed_accessor(); + + auto out_values_2 = out_values.view({nnz, nvalues}); + auto out_values_accessor = out_values_2.packed_accessor(); + + Tensor sorted_indices; + Tensor pool_offsets; + Tensor pool_sizes; + Tensor mx_buffer; + + std::tie(sorted_indices, pool_offsets, pool_sizes, mx_buffer) = + compute_pool_max(indices, values_2, sizes, nvalues, dim); + + auto pool_size = pool_offsets.size(0); + int block_size = getNumThreads(pool_size); + const int grid_size = (pool_size + block_size - 1) / block_size; + + cuda_sparse_coo_softmax_kernel + <<>>( + sorted_indices.data_ptr(), + pool_size, + pool_sizes.data_ptr(), + pool_offsets.data_ptr(), + nvalues, + mx_buffer.data_ptr(), + values_accessor, + out_values_accessor); + THCudaCheck(cudaGetLastError()); +} + +template +void cuda_sparse_coo_softmax_backward( + Tensor& grad_input, + const Tensor& grad, + const Tensor& output, + const int64_t dim) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax_backward for + the CPU implementation of the sparse softmax backward algorithm that this + implementation is based on. 
+ */ + auto sparse_dim = output.sparse_dim(); + auto sizes = output.sizes().vec(); + auto grad_indices = grad._indices().contiguous(); + auto grad_values = grad._values().contiguous(); + auto out_indices = output._indices().contiguous(); + auto out_values = output._values().contiguous(); + auto values = grad_input._values(); + auto indices = grad_input._indices(); + auto out_nnz = out_values.size(0); + auto grad_nnz = grad_values.size(0); + + values.resize_as_(out_values); + values.zero_(); + indices.resize_as_(out_indices); + indices.copy_(out_indices); + + auto out_offsets = get_offsets(out_indices, sizes, -1); + auto grad_offsets = get_offsets(grad_indices, sizes, -1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + /* when dim >= sparse_dim the dense backward is used */ + if (dim >= sparse_dim) { + if (at::native::cuda_equal(out_offsets, grad_offsets) == true) { + Tensor unused = at::native::empty_like(grad_values); + if (LogSoftMax) { + auto r = log_softmax_backward_cuda(grad_values, out_values, dim - sparse_dim + 1, unused); + values.set_(r); + } else { + auto r = softmax_backward_cuda(grad_values, out_values, dim - sparse_dim + 1, unused); + values.set_(r); + } + } else { + auto host_out_offsets = + out_offsets.to(at::Device(kCPU), indices.dtype(), false, true); + auto host_grad_offsets = + grad_offsets.to(at::Device(kCPU), indices.dtype(), false, true); + auto out_offsets_accessor = host_out_offsets.data_ptr(); + auto grad_offsets_accessor = host_grad_offsets.data_ptr(); + for (int64_t i = 0; i < out_nnz; i++) { + Tensor unused = at::native::empty_like(grad_values); + auto low = thrust::lower_bound( + grad_offsets_accessor, + grad_offsets_accessor + grad_offsets.size(0), + out_offsets_accessor[i]); + auto j = low - grad_offsets_accessor; + /* + Compute output using dense backward only when limits and pools are valid + If this check is false then a sparse tensor with full of zeros is returned + */ + if (j < grad_nnz && out_offsets_accessor[i] == grad_offsets_accessor[j]) { + if (LogSoftMax) { + auto r = log_softmax_backward_cuda( + grad_values[j], out_values[i], dim - sparse_dim, unused); + values[i].copy_(r); + } else { + auto r = softmax_backward_cuda( + grad_values[j], out_values[i], dim - sparse_dim, unused); + values[i].copy_(r); + } + } + } + } + return; + } + + auto nnz = values.size(0); + auto nvalues = values.numel() / nnz; + + auto values_2 = values.view({nnz, nvalues}); + auto values_accessor = values_2.packed_accessor(); + + auto out_values_2 = out_values.view({out_nnz, nvalues}); + auto out_values_accessor = out_values_2.packed_accessor(); + + auto grad_values_2 = grad_values.view({grad_nnz, nvalues}); + auto grad_values_accessor = grad_values_2.packed_accessor(); + + Tensor lower_bound_values = + at::empty({out_offsets.size(0)}, indices.options()); + + thrust::lower_bound( + policy, + thrust_ptr(grad_offsets.data_ptr()), + thrust_ptr(grad_offsets.data_ptr() + grad_offsets.size(0)), + thrust_ptr(out_offsets.data_ptr()), + thrust_ptr(out_offsets.data_ptr()) + out_offsets.size(0), + thrust_ptr(lower_bound_values.data_ptr())); + + Tensor sorted_indices; + Tensor pool_offsets; + Tensor pool_sizes; + + /* Compute independent pools of indices */ + std::tie( + sorted_indices, pool_offsets, pool_sizes, std::ignore) = + compute_pool_max( + out_indices, values_2, sizes, nvalues, dim); + + auto pool_size = pool_offsets.size(0); + + int 
block_size = getNumThreads(pool_size); + const int grid_size = (pool_size + block_size - 1) / block_size; + + cuda_sparse_coo_softmax_backward_kernel + <<>>( + sorted_indices.data_ptr(), + pool_size, + pool_sizes.data_ptr(), + pool_offsets.data_ptr(), + nvalues, + grad_nnz, + grad_offsets.data_ptr(), + out_offsets.data_ptr(), + lower_bound_values.data_ptr(), + values_accessor, + out_values_accessor, + grad_values_accessor); + THCudaCheck(cudaGetLastError()); +} + +} // end anonymous namespace + +Tensor softmax_sparse_cuda( + const Tensor& input_, + const int64_t dim, + const bool half_to_float) { + Tensor input, output; + std::tie(input, output) = softmax_sparse_input_preprocessing( + input_, dim, half_to_float, "softmax"); + if (input.numel() == 0) { + return output; + } + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "softmax", [&] { + cuda_sparse_coo_softmax(output, input, dim); + }); + return output; +} + +Tensor log_softmax_sparse_cuda( + const Tensor& input_, + const int64_t dim, + const bool half_to_float) { + Tensor input, output; + std::tie(input, output) = softmax_sparse_input_preprocessing( + input_, dim, half_to_float, "log_softmax"); + if (input.numel() == 0) { + return output; + } + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_softmax", [&] { + cuda_sparse_coo_softmax(output, input, dim); + }); + return output; +} + +Tensor softmax_backward_sparse_cuda( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_) { + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "softmax_backward"); + if (output.numel() == 0) { + return grad_input; + } + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] { + cuda_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); + }); + return grad_input; +} + +Tensor log_softmax_backward_sparse_cuda( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_) { + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "log_softmax_backward"); + if (output.numel() == 0) { + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "log_softmax_backward", [&] { + cuda_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); + }); + return grad_input; +} + +} // namespace native +} // namespace at diff --git a/test/test_sparse.py b/test/test_sparse.py index af833be6810c..2a0e76afe36a 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -2589,7 +2589,7 @@ def test_sparse_to_numpy(self): t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([1, 4])) self.assertRaises(TypeError, lambda: t.numpy()) - @cpu_only + @skipIfRocm def test_softmax(self): import torch.nn.functional as F From 6d21d5f0b33c755f715efa1ed498c017629fcd93 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Thu, 24 Sep 2020 00:19:05 -0700 Subject: [PATCH 084/449] gtest-ify JIT tests, through the letter c (#45249) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45249 Reland of https://github.com/pytorch/pytorch/pull/45055 and https://github.com/pytorch/pytorch/pull/45020 See https://github.com/pytorch/pytorch/pull/45018 for context. 
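The shape of the conversion is mechanical. A minimal before/after sketch (the suite and case names mirror ones touched in this diff, but the body below is a stand-in rather than the real test):

```cpp
#include <gtest/gtest.h>

#include <string>

// Old style: a plain function, declared in test/cpp/jit/tests.h and run
// from a hand-maintained harness, e.g.
//
//   void testCleanUpPasses() {
//     ...
//     ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice);
//   }
//
// New style: the same assertions live inside a TEST macro, and gtest
// takes care of discovery, registration, filtering, and reporting.
TEST(CleanupPassTest, Basic) {
  // Illustrative body only: the real case parses IR, runs
  // runCleanupPasses twice, and checks the printed graph is stable.
  std::string after_first_pass = "graph(...)";
  std::string after_second_pass = "graph(...)";
  ASSERT_EQ(after_first_pass, after_second_pass);
}
```

Because each TEST self-registers with the gtest runner, the hand-written listings in test/cpp/jit/tests.h can be trimmed as files are converted.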
Test Plan: Imported from OSS Reviewed By: jamesr66a Differential Revision: D23892645 Pulled By: suo fbshipit-source-id: e7fe58d5e1a5a0c44f4e2aec9694145afabde0fd --- test/cpp/jit/CMakeLists.txt | 7 +- test/cpp/jit/test_autodiff.cpp | 9 +- test/cpp/jit/test_class_import.cpp | 12 +- test/cpp/jit/test_class_parser.cpp | 4 +- test/cpp/jit/test_cleanup_passes.cpp | 37 +- test/cpp/jit/test_code_template.cpp | 50 ++- test/cpp/jit/test_constant_pooling.cpp | 87 ++--- .../jit/test_create_autodiff_subgraphs.cpp | 5 +- test/cpp/jit/test_custom_class.cpp | 315 +--------------- .../jit/test_custom_class_registrations.cpp | 291 +++++++++++++++ .../cpp/jit/test_custom_class_registrations.h | 36 ++ test/cpp/jit/test_custom_operators.cpp | 342 +++++++++--------- test/cpp/jit/test_dce.cpp | 6 +- test/cpp/jit/test_fuser.cpp | 41 ++- test/cpp/jit/test_misc.cpp | 10 + test/cpp/jit/tests.h | 23 +- 16 files changed, 641 insertions(+), 634 deletions(-) create mode 100644 test/cpp/jit/test_custom_class_registrations.cpp create mode 100644 test/cpp/jit/test_custom_class_registrations.h diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index 84f7193ad8c0..b8f6ef195226 100644 --- a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -2,7 +2,10 @@ set(JIT_TEST_ROOT ${TORCH_ROOT}/test/cpp/jit) # Build separate libraries the define custom classes/operators used from our Python tests. # These are intended to be used with torch.ops.load_library() in our Python test suite. -add_library(torchbind_test SHARED ${JIT_TEST_ROOT}/test_custom_class.cpp) +add_library(torchbind_test SHARED + ${JIT_TEST_ROOT}/test_custom_class_registrations.h + ${JIT_TEST_ROOT}/test_custom_class_registrations.cpp +) target_link_libraries(torchbind_test torch) add_library(jitbackend_test SHARED ${JIT_TEST_ROOT}/test_backend.cpp) @@ -30,6 +33,8 @@ set(JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_cleanup_passes.cpp ${JIT_TEST_ROOT}/test_create_autodiff_subgraphs.cpp ${JIT_TEST_ROOT}/test_custom_class.cpp + ${JIT_TEST_ROOT}/test_custom_class_registrations.h + ${JIT_TEST_ROOT}/test_custom_class_registrations.cpp ${JIT_TEST_ROOT}/test_custom_operators.cpp ${JIT_TEST_ROOT}/test_dce.cpp ${JIT_TEST_ROOT}/test_fuser.cpp diff --git a/test/cpp/jit/test_autodiff.cpp b/test/cpp/jit/test_autodiff.cpp index 7d431776a971..3993c63b1708 100644 --- a/test/cpp/jit/test_autodiff.cpp +++ b/test/cpp/jit/test_autodiff.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/frontend/tracer.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" @@ -83,7 +84,7 @@ variable_list grad( fmap(inputs, get_edge)); } -void testADFormulas() { +TEST(AutodiffTest, ADFormulas) { const auto cast = [](const Variable& v) { return static_cast(v); }; @@ -174,7 +175,7 @@ void testADFormulas() { } } -void testDifferentiate() { +TEST(AutodiffTest, Differentiate) { // Note: can't use IRParser for this test due to issue #23989 auto graph = std::make_shared(); std::vector sizes{2, 3, 4}; @@ -229,7 +230,7 @@ void testDifferentiate() { ->run(*grad_spec.df); } -void testDifferentiateWithRequiresGrad() { +TEST(AutodiffTest, DifferentiateWithRequiresGrad) { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): diff --git a/test/cpp/jit/test_class_import.cpp b/test/cpp/jit/test_class_import.cpp index 82bc0cf3bccc..ffa845b3e2a8 100644 --- a/test/cpp/jit/test_class_import.cpp +++ b/test/cpp/jit/test_class_import.cpp @@ -1,7 +1,7 @@ -#include -#include +#include #include +#include 
#include #include #include @@ -45,7 +45,7 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } -void testClassImport() { +TEST(ClassImportTest, Basic) { auto cu1 = std::make_shared(); auto cu2 = std::make_shared(); std::vector constantTable; @@ -80,7 +80,7 @@ void testClassImport() { ASSERT_FALSE(c); } -void testScriptObject() { +TEST(ClassImportTest, ScriptObject) { Module m1("m1"); Module m2("m2"); std::vector constantTable; @@ -114,7 +114,7 @@ def __init__(self, x): return x )JIT"; -void testClassDerive() { +TEST(ClassImportTest, ClassDerive) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu); const auto self = SimpleSelf(cls); @@ -142,7 +142,7 @@ class FooBar1234(Module): return (self.f).top() )JIT"; -void testSaveLoadTorchbind() { +TEST(ClassImportTest, CustomClass) { auto cu1 = std::make_shared(); std::vector constantTable; // Import different versions of FooTest into two namespaces. diff --git a/test/cpp/jit/test_class_parser.cpp b/test/cpp/jit/test_class_parser.cpp index 45e37103bb5a..a5b19f63fd3f 100644 --- a/test/cpp/jit/test_class_parser.cpp +++ b/test/cpp/jit/test_class_parser.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -15,7 +17,7 @@ const auto testSource = R"JIT( an_attribute : Tensor )JIT"; -void testClassParser() { +TEST(ClassParserTest, Basic) { Parser p(std::make_shared(testSource)); std::vector definitions; std::vector resolvers; diff --git a/test/cpp/jit/test_cleanup_passes.cpp b/test/cpp/jit/test_cleanup_passes.cpp index 2f2ca4e0a19b..38ceef932eb0 100644 --- a/test/cpp/jit/test_cleanup_passes.cpp +++ b/test/cpp/jit/test_cleanup_passes.cpp @@ -1,19 +1,19 @@ +#include + #include #include #include #include -#include "test/cpp/jit/test_base.h" namespace torch { namespace jit { -void testCleanUpPasses() { +TEST(CleanupPassTest, Basic) { // Tests stability of clean up passes when dealing with constant pooling // and constant propagation. 
- { - auto graph = std::make_shared(); - parseIR( - R"IR( + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond.1 : Tensor, %suffix.1 : str): %3 : bool = aten::Bool(%cond.1) # o.py:6:7 @@ -31,20 +31,19 @@ graph(%cond.1 : Tensor, -> (%12) return (%25) )IR", - &*graph); - runCleanupPasses(graph); - testing::FileCheck() - .check_count( - "prim::Constant[value=\"same string with a twist\"]", - 1, - /*exactly=*/true) - ->run(*graph); + &*graph); + runCleanupPasses(graph); + testing::FileCheck() + .check_count( + "prim::Constant[value=\"same string with a twist\"]", + 1, + /*exactly=*/true) + ->run(*graph); - auto graph_after_pass_once = graph->toString(); - runCleanupPasses(graph); - auto graph_after_pass_twice = graph->toString(); - ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); - } + auto graph_after_pass_once = graph->toString(); + runCleanupPasses(graph); + auto graph_after_pass_twice = graph->toString(); + ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_code_template.cpp b/test/cpp/jit/test_code_template.cpp index e4d7d1ef856e..bf539e3d169f 100644 --- a/test/cpp/jit/test_code_template.cpp +++ b/test/cpp/jit/test_code_template.cpp @@ -1,6 +1,6 @@ -#include "test/cpp/jit/test_base.h" -#include "test/cpp/jit/test_utils.h" +#include +#include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/frontend/code_template.h" namespace torch { @@ -33,31 +33,29 @@ static const auto ct_expect = R"( int notest(int a) )"; -void testCodeTemplate() { - { - TemplateEnv e; - e.s("hi", "foo"); - e.v("what", {"is", "this"}); - TemplateEnv c(e); - c.s("hi", "foo2"); - ASSERT_EQ(e.s("hi"), "foo"); - ASSERT_EQ(c.s("hi"), "foo2"); - ASSERT_EQ(e.v("what")[0], "is"); - } +TEST(TestCodeTemplate, Copying) { + TemplateEnv e; + e.s("hi", "foo"); + e.v("what", {"is", "this"}); + TemplateEnv c(e); + c.s("hi", "foo2"); + ASSERT_EQ(e.s("hi"), "foo"); + ASSERT_EQ(c.s("hi"), "foo2"); + ASSERT_EQ(e.v("what")[0], "is"); +} - { - TemplateEnv e; - e.v("args", {"hi", "8"}); - e.v("bar", {"what\non many\nlines...", "7"}); - e.s("a", "3"); - e.s("b", "4"); - e.v("stuff", {"things...", "others"}); - e.v("empty", {}); - auto s = ct.format(e); - // std::cout << "'" << s << "'\n"; - // std::cout << "'" << ct_expect << "'\n"; - ASSERT_EQ(s, ct_expect); - } +TEST(TestCodeTemplate, Formatting) { + TemplateEnv e; + e.v("args", {"hi", "8"}); + e.v("bar", {"what\non many\nlines...", "7"}); + e.s("a", "3"); + e.s("b", "4"); + e.v("stuff", {"things...", "others"}); + e.v("empty", {}); + auto s = ct.format(e); + // std::cout << "'" << s << "'\n"; + // std::cout << "'" << ct_expect << "'\n"; + ASSERT_EQ(s, ct_expect); } } // namespace jit diff --git a/test/cpp/jit/test_constant_pooling.cpp b/test/cpp/jit/test_constant_pooling.cpp index b949c9a45b25..c8cb58e1886a 100644 --- a/test/cpp/jit/test_constant_pooling.cpp +++ b/test/cpp/jit/test_constant_pooling.cpp @@ -1,9 +1,10 @@ +#include + #include #include #include #include #include -#include "test/cpp/jit/test_base.h" #include #include @@ -11,26 +12,26 @@ namespace torch { namespace jit { -void testConstantPooling() { - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(ConstantPoolingTest, Int) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %8 : int = prim::Constant[value=1]() %10 : int = prim::Constant[value=1]() return (%8, %10) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant", 1, /*exactly*/ true) - 
->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant", 1, /*exactly*/ true) + ->run(*graph); +} + +TEST(ConstantPoolingTest, PoolingAcrossBlocks) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond : Tensor): %a : str = prim::Constant[value="bcd"]() %3 : bool = aten::Bool(%cond) @@ -44,17 +45,18 @@ graph(%cond : Tensor): %7 : (str, str) = prim::TupleConstruct(%a, %b) return (%7) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) - ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) - ->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) + ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) + ->run(*graph); +} + +TEST(ConstantPoolingTest, PoolingDifferentDevices) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %2 : int = prim::Constant[value=2]() %1 : int = prim::Constant[value=1]() @@ -70,22 +72,21 @@ graph(): prim::Print(%x, %y, %z) return (%1) )IR", - &*graph); - // three tensors created - two different devices among the three - // don't have good support for parsing tensor constants - ConstantPropagation(graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count( - "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->check_count( - "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->run(*graph); - } + &*graph); + // three tensors created - two different devices among the three + // don't have good support for parsing tensor constants + ConstantPropagation(graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count( + "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->check_count( + "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->run(*graph); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_create_autodiff_subgraphs.cpp b/test/cpp/jit/test_create_autodiff_subgraphs.cpp index 8da6d9d6a1b2..e97043f84d24 100644 --- a/test/cpp/jit/test_create_autodiff_subgraphs.cpp +++ b/test/cpp/jit/test_create_autodiff_subgraphs.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" @@ -6,7 +7,7 @@ namespace torch { namespace jit { -void testCreateAutodiffSubgraphs() { +TEST(CreateAutodiffSubgraphsTest, Basic) { auto graph = build_lstm(); CreateAutodiffSubgraphs(graph, /*threshold=*/2); // all of the ops are within the DifferentiableGraph diff --git a/test/cpp/jit/test_custom_class.cpp b/test/cpp/jit/test_custom_class.cpp index 543fbc20eb3d..a96a3b4a5635 100644 --- a/test/cpp/jit/test_custom_class.cpp +++ b/test/cpp/jit/test_custom_class.cpp @@ -1,3 +1,6 @@ +#include + +#include #include #include @@ -8,317 +11,7 @@ namespace torch { namespace jit { -namespace { - -struct Foo : torch::CustomClassHolder { - int x, y; - Foo() : x(0), y(0) {} - Foo(int x_, int y_) : x(x_), y(y_) {} - int64_t info() { - return this->x * this->y; - } - int64_t add(int64_t z) { - return (x + y) * z; - } - void increment(int64_t z) { - this->x += z; - this->y += z; - } - int64_t combine(c10::intrusive_ptr b) { - 
return this->info() + b->info(); - } - ~Foo() { - // std::cout<<"Destroying object with values: "< -struct MyStackClass : torch::CustomClassHolder { - std::vector stack_; - MyStackClass(std::vector init) : stack_(init.begin(), init.end()) {} - - void push(T x) { - stack_.push_back(x); - } - T pop() { - auto val = stack_.back(); - stack_.pop_back(); - return val; - } - - c10::intrusive_ptr clone() const { - return c10::make_intrusive(stack_); - } - - void merge(const c10::intrusive_ptr& c) { - for (auto& elem : c->stack_) { - push(elem); - } - } - - std::tuple return_a_tuple() const { - return std::make_tuple(1337.0f, 123); - } -}; - -struct PickleTester : torch::CustomClassHolder { - PickleTester(std::vector vals) : vals(std::move(vals)) {} - std::vector vals; -}; - -at::Tensor take_an_instance(const c10::intrusive_ptr& instance) { - return torch::zeros({instance->vals.back(), 4}); -} - -struct ElementwiseInterpreter : torch::CustomClassHolder { - using InstructionType = std::tuple< - std::string /*op*/, - std::vector /*inputs*/, - std::string /*output*/>; - - ElementwiseInterpreter() {} - - // Load a list of instructions into the interpreter. As specified above, - // instructions specify the operation (currently support "add" and "mul"), - // the names of the input values, and the name of the single output value - // from this instruction - void setInstructions(std::vector instructions) { - instructions_ = std::move(instructions); - } - - // Add a constant. The interpreter maintains a set of constants across - // calls. They are keyed by name, and constants can be referenced in - // Instructions by the name specified - void addConstant(const std::string& name, at::Tensor value) { - constants_.insert_or_assign(name, std::move(value)); - } - - // Set the string names for the positional inputs to the function this - // interpreter represents. When invoked, the interpreter will assign - // the positional inputs to the names in the corresponding position in - // input_names. - void setInputNames(std::vector input_names) { - input_names_ = std::move(input_names); - } - - // Specify the output name for the function this interpreter represents. This - // should match the "output" field of one of the instructions in the - // instruction list, typically the last instruction. - void setOutputName(std::string output_name) { - output_name_ = std::move(output_name); - } - - // Invoke this interpreter. This takes a list of positional inputs and returns - // a single output. Currently, inputs and outputs must all be Tensors. - at::Tensor __call__(std::vector inputs) { - // Environment to hold local variables - std::unordered_map environment; - - // Load inputs according to the specified names - if (inputs.size() != input_names_.size()) { - std::stringstream err; - err << "Expected " << input_names_.size() << " inputs, but got " - << inputs.size() << "!"; - throw std::runtime_error(err.str()); - } - for (size_t i = 0; i < inputs.size(); ++i) { - environment[input_names_[i]] = inputs[i]; - } - - for (InstructionType& instr : instructions_) { - // Retrieve all input values for this op - std::vector inputs; - for (const auto& input_name : std::get<1>(instr)) { - // Operator output values shadow constants. - // Imagine all constants are defined in statements at the beginning - // of a function (a la K&R C). Any definition of an output value must - // necessarily come after constant definition in textual order. 
Thus, - // We look up values in the environment first then the constant table - // second to implement this shadowing behavior - if (environment.find(input_name) != environment.end()) { - inputs.push_back(environment.at(input_name)); - } else if (constants_.find(input_name) != constants_.end()) { - inputs.push_back(constants_.at(input_name)); - } else { - std::stringstream err; - err << "Instruction referenced unknown value " << input_name << "!"; - throw std::runtime_error(err.str()); - } - } - - // Run the specified operation - at::Tensor result; - const auto& op = std::get<0>(instr); - if (op == "add") { - if (inputs.size() != 2) { - throw std::runtime_error("Unexpected number of inputs for add op!"); - } - result = inputs[0] + inputs[1]; - } else if (op == "mul") { - if (inputs.size() != 2) { - throw std::runtime_error("Unexpected number of inputs for mul op!"); - } - result = inputs[0] * inputs[1]; - } else { - std::stringstream err; - err << "Unknown operator " << op << "!"; - throw std::runtime_error(err.str()); - } - - // Write back result into environment - const auto& output_name = std::get<2>(instr); - environment[output_name] = std::move(result); - } - - if (!output_name_) { - throw std::runtime_error("Output name not specififed!"); - } - - return environment.at(*output_name_); - } - - // Ser/De infrastructure. See - // https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html#defining-serialization-deserialization-methods-for-custom-c-classes - // for more info. - - // This is the type we will use to marshall information on disk during - // ser/de. It is a simple tuple composed of primitive types and simple - // collection types like vector, optional, and dict. - using SerializationType = std::tuple< - std::vector /*input_names_*/, - c10::optional /*output_name_*/, - c10::Dict /*constants_*/, - std::vector /*instructions_*/ - >; - - // This function yields the SerializationType instance for `this`. - SerializationType __getstate__() const { - return SerializationType{ - input_names_, output_name_, constants_, instructions_}; - } - - // This function will create an instance of `ElementwiseInterpreter` given - // an instance of `SerializationType`. 
- static c10::intrusive_ptr __setstate__( - SerializationType state) { - auto instance = c10::make_intrusive(); - std::tie( - instance->input_names_, - instance->output_name_, - instance->constants_, - instance->instructions_) = std::move(state); - return instance; - } - - // Class members - std::vector input_names_; - c10::optional output_name_; - c10::Dict constants_; - std::vector instructions_; -}; - -TORCH_LIBRARY(_TorchScriptTesting, m) { - m.class_("_Foo") - .def(torch::init()) - // .def(torch::init<>()) - .def("info", &Foo::info) - .def("increment", &Foo::increment) - .def("add", &Foo::add) - .def("combine", &Foo::combine); - - m.class_("_NoInit").def( - "get_x", [](const c10::intrusive_ptr& self) { return self->x; }); - - m.class_>("_StackString") - .def(torch::init>()) - .def("push", &MyStackClass::push) - .def("pop", &MyStackClass::pop) - .def("clone", &MyStackClass::clone) - .def("merge", &MyStackClass::merge) - .def_pickle( - [](const c10::intrusive_ptr>& self) { - return self->stack_; - }, - [](std::vector state) { // __setstate__ - return c10::make_intrusive>( - std::vector{"i", "was", "deserialized"}); - }) - .def("return_a_tuple", &MyStackClass::return_a_tuple) - .def( - "top", - [](const c10::intrusive_ptr>& self) - -> std::string { return self->stack_.back(); }) - .def( - "__str__", - [](const c10::intrusive_ptr>& self) { - std::stringstream ss; - ss << "["; - for (size_t i = 0; i < self->stack_.size(); ++i) { - ss << self->stack_[i]; - if (i != self->stack_.size() - 1) { - ss << ", "; - } - } - ss << "]"; - return ss.str(); - }); - // clang-format off - // The following will fail with a static assert telling you you have to - // take an intrusive_ptr as the first argument. - // .def("foo", [](int64_t a) -> int64_t{ return 3;}); - // clang-format on - - m.class_("_PickleTester") - .def(torch::init>()) - .def_pickle( - [](c10::intrusive_ptr self) { // __getstate__ - return std::vector{1, 3, 3, 7}; - }, - [](std::vector state) { // __setstate__ - return c10::make_intrusive(std::move(state)); - }) - .def( - "top", - [](const c10::intrusive_ptr& self) { - return self->vals.back(); - }) - .def("pop", [](const c10::intrusive_ptr& self) { - auto val = self->vals.back(); - self->vals.pop_back(); - return val; - }); - - m.def( - "take_an_instance(__torch__.torch.classes._TorchScriptTesting._PickleTester x) -> Tensor Y", - take_an_instance); - // test that schema inference is ok too - m.def("take_an_instance_inferred", take_an_instance); - - m.class_("_ElementwiseInterpreter") - .def(torch::init<>()) - .def("set_instructions", &ElementwiseInterpreter::setInstructions) - .def("add_constant", &ElementwiseInterpreter::addConstant) - .def("set_input_names", &ElementwiseInterpreter::setInputNames) - .def("set_output_name", &ElementwiseInterpreter::setOutputName) - .def("__call__", &ElementwiseInterpreter::__call__) - .def_pickle( - /* __getstate__ */ - [](const c10::intrusive_ptr& self) { - return self->__getstate__(); - }, - /* __setstate__ */ - [](ElementwiseInterpreter::SerializationType state) { - return ElementwiseInterpreter::__setstate__(std::move(state)); - }); -} - -} // namespace - -void testTorchbindIValueAPI() { +TEST(CustomClassTest, TorchbindIValueAPI) { script::Module m("m"); // test make_custom_class API diff --git a/test/cpp/jit/test_custom_class_registrations.cpp b/test/cpp/jit/test_custom_class_registrations.cpp new file mode 100644 index 000000000000..f563120bbc6c --- /dev/null +++ b/test/cpp/jit/test_custom_class_registrations.cpp @@ -0,0 +1,291 @@ +#include + 
+#include +#include + +#include +#include +#include + +using namespace torch::jit; + +namespace { + +struct Foo : torch::CustomClassHolder { + int x, y; + Foo() : x(0), y(0) {} + Foo(int x_, int y_) : x(x_), y(y_) {} + int64_t info() { + return this->x * this->y; + } + int64_t add(int64_t z) { + return (x + y) * z; + } + void increment(int64_t z) { + this->x += z; + this->y += z; + } + int64_t combine(c10::intrusive_ptr b) { + return this->info() + b->info(); + } + ~Foo() { + // std::cout<<"Destroying object with values: "< vals) : vals(std::move(vals)) {} + std::vector vals; +}; + +at::Tensor take_an_instance(const c10::intrusive_ptr& instance) { + return torch::zeros({instance->vals.back(), 4}); +} + +struct ElementwiseInterpreter : torch::CustomClassHolder { + using InstructionType = std::tuple< + std::string /*op*/, + std::vector /*inputs*/, + std::string /*output*/>; + + ElementwiseInterpreter() {} + + // Load a list of instructions into the interpreter. As specified above, + // instructions specify the operation (currently support "add" and "mul"), + // the names of the input values, and the name of the single output value + // from this instruction + void setInstructions(std::vector instructions) { + instructions_ = std::move(instructions); + } + + // Add a constant. The interpreter maintains a set of constants across + // calls. They are keyed by name, and constants can be referenced in + // Instructions by the name specified + void addConstant(const std::string& name, at::Tensor value) { + constants_.insert_or_assign(name, std::move(value)); + } + + // Set the string names for the positional inputs to the function this + // interpreter represents. When invoked, the interpreter will assign + // the positional inputs to the names in the corresponding position in + // input_names. + void setInputNames(std::vector input_names) { + input_names_ = std::move(input_names); + } + + // Specify the output name for the function this interpreter represents. This + // should match the "output" field of one of the instructions in the + // instruction list, typically the last instruction. + void setOutputName(std::string output_name) { + output_name_ = std::move(output_name); + } + + // Invoke this interpreter. This takes a list of positional inputs and returns + // a single output. Currently, inputs and outputs must all be Tensors. + at::Tensor __call__(std::vector inputs) { + // Environment to hold local variables + std::unordered_map environment; + + // Load inputs according to the specified names + if (inputs.size() != input_names_.size()) { + std::stringstream err; + err << "Expected " << input_names_.size() << " inputs, but got " + << inputs.size() << "!"; + throw std::runtime_error(err.str()); + } + for (size_t i = 0; i < inputs.size(); ++i) { + environment[input_names_[i]] = inputs[i]; + } + + for (InstructionType& instr : instructions_) { + // Retrieve all input values for this op + std::vector inputs; + for (const auto& input_name : std::get<1>(instr)) { + // Operator output values shadow constants. + // Imagine all constants are defined in statements at the beginning + // of a function (a la K&R C). Any definition of an output value must + // necessarily come after constant definition in textual order. 
Thus, + // We look up values in the environment first then the constant table + // second to implement this shadowing behavior + if (environment.find(input_name) != environment.end()) { + inputs.push_back(environment.at(input_name)); + } else if (constants_.find(input_name) != constants_.end()) { + inputs.push_back(constants_.at(input_name)); + } else { + std::stringstream err; + err << "Instruction referenced unknown value " << input_name << "!"; + throw std::runtime_error(err.str()); + } + } + + // Run the specified operation + at::Tensor result; + const auto& op = std::get<0>(instr); + if (op == "add") { + if (inputs.size() != 2) { + throw std::runtime_error("Unexpected number of inputs for add op!"); + } + result = inputs[0] + inputs[1]; + } else if (op == "mul") { + if (inputs.size() != 2) { + throw std::runtime_error("Unexpected number of inputs for mul op!"); + } + result = inputs[0] * inputs[1]; + } else { + std::stringstream err; + err << "Unknown operator " << op << "!"; + throw std::runtime_error(err.str()); + } + + // Write back result into environment + const auto& output_name = std::get<2>(instr); + environment[output_name] = std::move(result); + } + + if (!output_name_) { + throw std::runtime_error("Output name not specififed!"); + } + + return environment.at(*output_name_); + } + + // Ser/De infrastructure. See + // https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html#defining-serialization-deserialization-methods-for-custom-c-classes + // for more info. + + // This is the type we will use to marshall information on disk during + // ser/de. It is a simple tuple composed of primitive types and simple + // collection types like vector, optional, and dict. + using SerializationType = std::tuple< + std::vector /*input_names_*/, + c10::optional /*output_name_*/, + c10::Dict /*constants_*/, + std::vector /*instructions_*/ + >; + + // This function yields the SerializationType instance for `this`. + SerializationType __getstate__() const { + return SerializationType{ + input_names_, output_name_, constants_, instructions_}; + } + + // This function will create an instance of `ElementwiseInterpreter` given + // an instance of `SerializationType`. 
+ static c10::intrusive_ptr __setstate__( + SerializationType state) { + auto instance = c10::make_intrusive(); + std::tie( + instance->input_names_, + instance->output_name_, + instance->constants_, + instance->instructions_) = std::move(state); + return instance; + } + + // Class members + std::vector input_names_; + c10::optional output_name_; + c10::Dict constants_; + std::vector instructions_; +}; + +TORCH_LIBRARY(_TorchScriptTesting, m) { + m.class_("_Foo") + .def(torch::init()) + // .def(torch::init<>()) + .def("info", &Foo::info) + .def("increment", &Foo::increment) + .def("add", &Foo::add) + .def("combine", &Foo::combine); + + m.class_("_NoInit").def( + "get_x", [](const c10::intrusive_ptr& self) { return self->x; }); + + m.class_>("_StackString") + .def(torch::init>()) + .def("push", &MyStackClass::push) + .def("pop", &MyStackClass::pop) + .def("clone", &MyStackClass::clone) + .def("merge", &MyStackClass::merge) + .def_pickle( + [](const c10::intrusive_ptr>& self) { + return self->stack_; + }, + [](std::vector state) { // __setstate__ + return c10::make_intrusive>( + std::vector{"i", "was", "deserialized"}); + }) + .def("return_a_tuple", &MyStackClass::return_a_tuple) + .def( + "top", + [](const c10::intrusive_ptr>& self) + -> std::string { return self->stack_.back(); }) + .def( + "__str__", + [](const c10::intrusive_ptr>& self) { + std::stringstream ss; + ss << "["; + for (size_t i = 0; i < self->stack_.size(); ++i) { + ss << self->stack_[i]; + if (i != self->stack_.size() - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + }); + // clang-format off + // The following will fail with a static assert telling you you have to + // take an intrusive_ptr as the first argument. + // .def("foo", [](int64_t a) -> int64_t{ return 3;}); + // clang-format on + + m.class_("_PickleTester") + .def(torch::init>()) + .def_pickle( + [](c10::intrusive_ptr self) { // __getstate__ + return std::vector{1, 3, 3, 7}; + }, + [](std::vector state) { // __setstate__ + return c10::make_intrusive(std::move(state)); + }) + .def( + "top", + [](const c10::intrusive_ptr& self) { + return self->vals.back(); + }) + .def("pop", [](const c10::intrusive_ptr& self) { + auto val = self->vals.back(); + self->vals.pop_back(); + return val; + }); + + m.def( + "take_an_instance(__torch__.torch.classes._TorchScriptTesting._PickleTester x) -> Tensor Y", + take_an_instance); + // test that schema inference is ok too + m.def("take_an_instance_inferred", take_an_instance); + + m.class_("_ElementwiseInterpreter") + .def(torch::init<>()) + .def("set_instructions", &ElementwiseInterpreter::setInstructions) + .def("add_constant", &ElementwiseInterpreter::addConstant) + .def("set_input_names", &ElementwiseInterpreter::setInputNames) + .def("set_output_name", &ElementwiseInterpreter::setOutputName) + .def("__call__", &ElementwiseInterpreter::__call__) + .def_pickle( + /* __getstate__ */ + [](const c10::intrusive_ptr& self) { + return self->__getstate__(); + }, + /* __setstate__ */ + [](ElementwiseInterpreter::SerializationType state) { + return ElementwiseInterpreter::__setstate__(std::move(state)); + }); +} + +} // namespace diff --git a/test/cpp/jit/test_custom_class_registrations.h b/test/cpp/jit/test_custom_class_registrations.h new file mode 100644 index 000000000000..4e6b7bd43883 --- /dev/null +++ b/test/cpp/jit/test_custom_class_registrations.h @@ -0,0 +1,36 @@ +#include +#include + +namespace torch { +namespace jit { + +template +struct MyStackClass : torch::CustomClassHolder { + std::vector stack_; + 
MyStackClass(std::vector init) : stack_(init.begin(), init.end()) {} + + void push(T x) { + stack_.push_back(x); + } + T pop() { + auto val = stack_.back(); + stack_.pop_back(); + return val; + } + + c10::intrusive_ptr clone() const { + return c10::make_intrusive(stack_); + } + + void merge(const c10::intrusive_ptr& c) { + for (auto& elem : c->stack_) { + push(elem); + } + } + + std::tuple return_a_tuple() const { + return std::make_tuple(1337.0f, 123); + } +}; +} // namespace jit +} // namespace torch diff --git a/test/cpp/jit/test_custom_operators.cpp b/test/cpp/jit/test_custom_operators.cpp index 529b36385bd4..d3f61268e8f1 100644 --- a/test/cpp/jit/test_custom_operators.cpp +++ b/test/cpp/jit/test_custom_operators.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/ir/alias_analysis.h" @@ -11,134 +12,135 @@ namespace torch { namespace jit { -void testCustomOperators() { - { - torch::RegisterOperators reg( - "foo::bar", [](double a, at::Tensor b) { return a + b; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, InferredSchema) { + torch::RegisterOperators reg( + "foo::bar", [](double a, at::Tensor b) { return a + b; }); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); + ASSERT_EQ(ops.size(), 1); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar"); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar"); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } - { - torch::RegisterOperators reg( - "foo::bar_with_schema(float a, Tensor b) -> Tensor", - [](double a, at::Tensor b) { return a + b; }); + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); +} - auto& ops = - getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, ExplicitSchema) { + torch::RegisterOperators reg( + "foo::bar_with_schema(float a, Tensor b) -> Tensor", + [](double a, at::Tensor b) { return a + b; }); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); + auto& ops = + getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - 
ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } - { - // Check that lists work well. - torch::RegisterOperators reg( - "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", - [](torch::List ints, - torch::List floats, - torch::List tensors) { return floats; }); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists"); - - ASSERT_EQ(op->schema().arguments().size(), 3); - ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); - ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); - ASSERT_TRUE( - op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); - ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); - - Stack stack; - push(stack, c10::List({1, 2})); - push(stack, c10::List({1.0, 2.0})); - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); - - ASSERT_EQ(output.size(), 2); - ASSERT_EQ(output.get(0), 1.0); - ASSERT_EQ(output.get(1), 2.0); - } - { - torch::RegisterOperators reg( - "foo::lists2(Tensor[] tensors) -> Tensor[]", - [](torch::List tensors) { return tensors; }); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); +} - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, ListParameters) { + // Check that lists work well. 
+ torch::RegisterOperators reg( + "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", + [](torch::List ints, + torch::List floats, + torch::List tensors) { return floats; }); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists"); + + ASSERT_EQ(op->schema().arguments().size(), 3); + ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); + ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); + ASSERT_TRUE( + op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); + ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); + + Stack stack; + push(stack, c10::List({1, 2})); + push(stack, c10::List({1.0, 2.0})); + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 2); + ASSERT_EQ(output.get(0), 1.0); + ASSERT_EQ(output.get(1), 2.0); +} - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists2"); +TEST(CustomOperatorTest, ListParameters2) { + torch::RegisterOperators reg( + "foo::lists2(Tensor[] tensors) -> Tensor[]", + [](torch::List tensors) { return tensors; }); - ASSERT_EQ(op->schema().arguments().size(), 1); - ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists2"); - Stack stack; - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); + ASSERT_EQ(op->schema().arguments().size(), 1); + ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); - ASSERT_EQ(output.size(), 1); - ASSERT_TRUE(output.get(0).allclose(at::ones(5))); - } + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + + Stack stack; + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 1); + ASSERT_TRUE(output.get(0).allclose(at::ones(5))); } -void testCustomOperatorAliasing() { +TEST(CustomOperatorTest, Aliasing) { torch::RegisterOperators reg( "foo::aliasing", [](at::Tensor a, at::Tensor b) -> at::Tensor { a.add_(b); @@ -182,77 +184,65 @@ graph(%x: Tensor, %y: Tensor): } } -void testIValueKWargs() { - const auto text = R"( - def foo(a : int, b : int, c : int = 4): - return a + 2*b + 3*c - )"; - auto cu = compile(text); - auto result = cu->get_function("foo")({1}, {{"b", 3}}); - ASSERT_EQ(result.toInt(), 19); -} - -void testTemplatedOperatorCreator() { - constexpr char op_list[] = "foofoo::bar.template;foo::another"; +static constexpr char op_list[] = "foofoo::bar.template;foo::another"; #define TORCH_SELECTIVE_NAME_IN_SCHEMA(l, n) \ 
torch::detail::SelectiveStr(n) - { - // Try to register an op name that does not exist in op_list. - // Expected: the op name is not registered. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); - ASSERT_EQ(ops.size(), 0); - } +TEST(TestCustomOperator, OperatorGeneratorUndeclared) { + // Try to register an op name that does not exist in op_list. + // Expected: the op name is not registered. + torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); + ASSERT_EQ(ops.size(), 0); +} - { - // The operator should be successfully registered since its name is in the - // whitelist. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foofoo::bar"); - - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); - - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } +TEST(TestCustomOperator, OperatorGeneratorBasic) { + // The operator should be successfully registered since its name is in the + // whitelist. 
+ torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foofoo::bar"); + + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); } } // namespace jit diff --git a/test/cpp/jit/test_dce.cpp b/test/cpp/jit/test_dce.cpp index 5799913c316a..6f9161d0d9ae 100644 --- a/test/cpp/jit/test_dce.cpp +++ b/test/cpp/jit/test_dce.cpp @@ -1,12 +1,12 @@ -#include -#include +#include +#include #include #include namespace torch { namespace jit { -void testDCE() { +TEST(EliminateDeadCodeTest, Basic) { auto graph = std::make_shared(); // Consider the following loop: diff --git a/test/cpp/jit/test_fuser.cpp b/test/cpp/jit/test_fuser.cpp index ee0ea060f02f..ef595215b882 100644 --- a/test/cpp/jit/test_fuser.cpp +++ b/test/cpp/jit/test_fuser.cpp @@ -1,4 +1,4 @@ -#include "test/cpp/jit/test_base.h" +#include #include #include "ATen/core/interned_strings.h" @@ -56,28 +56,27 @@ namespace torch { namespace jit { -void testFusion() { - auto testSimple = [&] { - const auto graph_string = R"IR( +TEST(FuserTest, TestSimple_CUDA) { + const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): %2 : Tensor = aten::mul(%0, %1) return (%2))IR"; - Graph graph; - torch::jit::parseIR(graph_string, &graph); - - auto a = at::rand({3, 4}, at::kCUDA); - auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); - auto o = at::zeros({3, 4}, at::kCUDA); - auto outputs = debugLaunchGraph(graph, {a, b}); - ASSERT_EQ(outputs.size(), 1); - auto o2 = a * b; - float max_diff = (o2 - outputs[0]).abs().max().item(); - // std::cout << "max diff: " << max_diff << "\n"; - ASSERT_EQ(max_diff, 0); - }; - testSimple(); + Graph graph; + torch::jit::parseIR(graph_string, &graph); + + auto a = at::rand({3, 4}, at::kCUDA); + auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); + auto o = at::zeros({3, 4}, at::kCUDA); + auto outputs = debugLaunchGraph(graph, {a, b}); + ASSERT_EQ(outputs.size(), 1); + auto o2 = a * b; + float max_diff = (o2 - outputs[0]).abs().max().item(); + // std::cout << "max diff: " << max_diff << "\n"; + ASSERT_EQ(max_diff, 0); +} +TEST(FuserTest, TestOne_CUDA) { auto testOne = [&](int ti, int tj) { const auto graph_string = R"IR( graph(%0 : Tensor, @@ -132,7 +131,9 @@ void testFusion() { testOne(0, 1); testOne(1, 2); testOne(0, 2); +} +TEST(FuserTest, FusedConcat_CUDA) { const auto graph_string0 = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -175,7 +176,7 @@ void testFusion() { }; } -void testFusionAliasing() { +TEST(FuserTest, FusionAliasing) { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -200,7 +201,7 @@ void testFusionAliasing() { ->run(*g); } -void 
testRegisterFusionCachesKernel() { +TEST(FuserTest, KernelCaching) { // Constructs two functionally equivalent graphs const auto graph0_string = R"IR( graph(%0 : Float(2, 3, 4), diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 953d1bf42fc0..92baba1168da 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -2225,5 +2225,15 @@ void testProfilerDisableInCallback() { t.join(); } +void testIValueKWargs() { + const auto text = R"( + def foo(a : int, b : int, c : int = 4): + return a + 2*b + 3*c + )"; + auto cu = compile(text); + auto result = cu->get_function("foo")({1}, {{"b", 3}}); + ASSERT_EQ(result.toInt(), 19); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 45d7f48b1f8a..186aaaec2bba 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -9,38 +9,26 @@ namespace torch { namespace jit { #define TH_FORALL_TESTS(_) \ - _(ADFormulas) \ _(Attributes) \ _(Blocks) \ _(CallStack) \ _(CallStackCaching) \ - _(CodeTemplate) \ _(ControlFlow) \ - _(CreateAutodiffSubgraphs) \ - _(CustomOperators) \ - _(CustomOperatorAliasing) \ - _(TemplatedOperatorCreator) \ _(IValueKWargs) \ _(CustomFusion) \ _(SchemaMatching) \ - _(Differentiate) \ - _(DifferentiateWithRequiresGrad) \ _(FromQualString) \ _(InternedStrings) \ _(PassManagement) \ _(Proto) \ - _(RegisterFusionCachesKernel) \ _(SchemaParser) \ _(TopologicalIndex) \ _(SubgraphUtils) \ _(SubgraphUtilsVmap) \ _(IRParser) \ - _(ConstantPooling) \ - _(CleanUpPasses) \ _(THNNConv) \ _(ATenNativeBatchNorm) \ _(NoneSchemaMatch) \ - _(ClassParser) \ _(UnifyTypes) \ _(Profiler) \ _(FallbackGraphs) \ @@ -61,15 +49,10 @@ namespace jit { _(ModuleDeepcopyAliasing) \ _(ModuleDefine) \ _(QualifiedName) \ - _(ClassImport) \ - _(ScriptObject) \ _(ExtraFilesHookPreference) \ _(SaveExtraFilesHook) \ _(TypeTags) \ - _(DCE) \ _(CustomFusionNestedBlocks) \ - _(ClassDerive) \ - _(SaveLoadTorchbind) \ _(ModuleInterfaceSerialization) \ _(ModuleCloneWithModuleInterface) \ _(ClassTypeAddRemoveAttr) \ @@ -100,7 +83,6 @@ namespace jit { _(LiteInterpreterHierarchyModuleInfo) \ _(LiteInterpreterDuplicatedClassTypeModuleInfo) \ _(LiteInterpreterEval) \ - _(TorchbindIValueAPI) \ _(LiteInterpreterDict) \ _(LiteInterpreterFindAndRunMethod) \ _(LiteInterpreterFindWrongMethodName) \ @@ -109,12 +91,10 @@ namespace jit { _(MobileSaveLoadParameters) \ _(MobileSaveLoadParametersEmpty) \ _(LiteSGD) \ - _(LiteSequentialSampler) \ - _(FusionAliasing) + _(LiteSequentialSampler) #if defined(USE_CUDA) #define TH_FORALL_TESTS_CUDA(_) \ - _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ _(Interp) \ @@ -219,7 +199,6 @@ namespace jit { _(GPU_FusionThreadPredicate) #else #define TH_FORALL_TESTS_CUDA(_) \ - _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ _(Interp) \ From dc67b47bc9d53dbeb898a4d920b0225ac73629ec Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Thu, 24 Sep 2020 02:38:00 -0700 Subject: [PATCH 085/449] Deprecate old fft functions (#44876) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44876 Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D23866715 Pulled By: mruberry fbshipit-source-id: 73305eb02f92cbd1ef7d175419529d19358fedda --- aten/src/ATen/native/SpectralOps.cpp | 16 ++++++++++++++++ docs/source/fft.rst | 2 ++ torch/_torch_docs.py | 26 ++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 120ef9f73042..1e9c1bce67d3 
100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -561,6 +561,10 @@ void _cufft_clear_plan_cache(int64_t device_index) { } Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { + TORCH_WARN_ONCE( + "The function torch.fft is deprecated and will be removed in PyTorch 1.8. " + "Use the new torch.fft module functions, instead, by importing torch.fft " + "and calling torch.fft.fft or torch.fft.fftn."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ true, /* inverse */ false, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none, @@ -568,6 +572,10 @@ Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) } Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { + TORCH_WARN_ONCE( + "The function torch.ifft is deprecated and will be removed in a future " + "PyTorch release. Use the new torch.fft module functions, instead, by " + "importing torch.fft and calling torch.fft.ifft or torch.fft.ifftn."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ true, /* inverse */ true, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::by_n, @@ -576,6 +584,10 @@ Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, const bool onesided) { + TORCH_WARN_ONCE( + "The function torch.rfft is deprecated and will be removed in a future " + "PyTorch release. Use the new torch.fft module functions, instead, by " + "importing torch.fft and calling torch.fft.fft or torch.fft.rfft."); return _fft(self, signal_ndim, /* complex_input */ false, /* complex_output */ true, /* inverse */ false, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none, @@ -584,6 +596,10 @@ Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized Tensor irfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, const bool onesided, IntArrayRef signal_sizes) { + TORCH_WARN_ONCE( + "The function torch.irfft is deprecated and will be removed in a future " + "PyTorch release. Use the new torch.fft module functions, instead, by " + "importing torch.fft and calling torch.fft.ifft or torch.fft.irfft."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ false, /* inverse */ true, signal_sizes, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::by_n, diff --git a/docs/source/fft.rst b/docs/source/fft.rst index a732f3e5c652..ab50bd271d32 100644 --- a/docs/source/fft.rst +++ b/docs/source/fft.rst @@ -1,6 +1,8 @@ .. role:: hidden :class: hidden-section +.. _torch-fft-module: + torch.fft ========= diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 5a3b2339fde5..32806259df35 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -8514,6 +8514,12 @@ def merge_dicts(*dicts): The inverse of this function is :func:`~torch.ifft`. +.. deprecated:: 1.7.0 + The function :func:`torch.fft` is deprecated and will be removed in + PyTorch 1.8. Use the new :ref:`torch.fft ` module + functions, instead, by importing :ref:`torch.fft ` and + calling :func:`torch.fft.fft` or :func:`torch.fft.fftn`. + .. note:: For CUDA tensors, an LRU cache is used for cuFFT plans to speed up repeatedly running FFT methods on tensors of same geometry with same @@ -8617,6 +8623,12 @@ def merge_dicts(*dicts): The inverse of this function is :func:`~torch.fft`. +.. 
deprecated:: 1.7.0 + The function :func:`torch.ifft` is deprecated and will be removed in a + future PyTorch release. Use the new :ref:`torch.fft ` + module functions, instead, by importing :ref:`torch.fft ` + and calling :func:`torch.fft.ifft` or :func:`torch.fft.ifftn`. + .. note:: For CUDA tensors, an LRU cache is used for cuFFT plans to speed up repeatedly running FFT methods on tensors of same geometry with same @@ -8705,6 +8717,13 @@ def merge_dicts(*dicts): The inverse of this function is :func:`~torch.irfft`. +.. deprecated:: 1.7.0 + The function :func:`torch.rfft` is deprecated and will be removed in a + future PyTorch release. Use the new :ref:`torch.fft ` + module functions, instead, by importing :ref:`torch.fft ` + and calling :func:`torch.fft.rfft` for one-sided output, or + :func:`torch.fft.fft` for two-sided output. + .. note:: For CUDA tensors, an LRU cache is used for cuFFT plans to speed up repeatedly running FFT methods on tensors of same geometry with same @@ -8777,6 +8796,13 @@ def merge_dicts(*dicts): The inverse of this function is :func:`~torch.rfft`. +.. deprecated:: 1.7.0 + The function :func:`torch.irfft` is deprecated and will be removed in a + future PyTorch release. Use the new :ref:`torch.fft ` + module functions, instead, by importing :ref:`torch.fft ` + and calling :func:`torch.fft.irfft` for one-sided input, or + :func:`torch.fft.ifft` for two-sided input. + .. warning:: Generally speaking, input to this function should contain values following conjugate symmetry. Note that even if :attr:`onesided` is From bea7901e387011248cf00e083af71dd92168c211 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Thu, 24 Sep 2020 08:20:06 -0700 Subject: [PATCH 086/449] Enable torch.tensor typechecks (#45077) Summary: this fixes https://github.com/pytorch/pytorch/issues/42983. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45077 Reviewed By: ezyang Differential Revision: D23842493 Pulled By: walterddr fbshipit-source-id: 1c516a5ff351743a187d00cba7ed0be11678edf1 --- mypy.ini | 3 -- tools/pyi/gen_pyi.py | 11 ++++---- torch/_C/__init__.pyi.in | 7 +++++ torch/tensor.py | 61 ++++++++++++++++++++++++---------------- torch/types.py | 16 +++++++++-- 5 files changed, 63 insertions(+), 35 deletions(-) diff --git a/mypy.ini b/mypy.ini index 07cdbc4dd6fa..a7c82cb69359 100644 --- a/mypy.ini +++ b/mypy.ini @@ -102,9 +102,6 @@ ignore_errors = True [mypy-torch.distributions.*] ignore_errors = True -[mypy-torch.tensor] -ignore_errors = True - [mypy-torch._tensor_str] ignore_errors = True diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 118a3e9b58b7..7079c6750223 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -74,11 +74,7 @@ # Somehow, these are defined in both _C and in functional. Ick! 
'broadcast_tensors', # Manually define named tensor type stubs in __init__.pyi.in - 'rename', - 'refine_names', - 'align_to', 'align_tensors', - 'unflatten', 'meshgrid', 'cartesian_prod', 'block_diag', @@ -87,7 +83,6 @@ 'stft', 'istft', 'tensordot', - 'norm', 'split', 'unique_consecutive', 'atleast_1d', @@ -536,6 +531,7 @@ def gen_pyi(declarations_path, out): 'def __init__(self, other: Tensor) -> None: ...', 'def __init__(self, size: {}, *, {}) -> None: ...'.format(type_to_python('IntArrayRef'), DEVICE_PARAM), ], + 'as_subclass': ["def as_subclass(self, cls: Tensor) -> Tensor: ..."], # clamp has no default values in the Declarations 'clamp': ["def clamp(self, min: _float=-inf, max: _float=inf," " *, out: Optional[Tensor]=None) -> Tensor: ..."], @@ -546,6 +542,7 @@ def gen_pyi(declarations_path, out): 'tolist': ['def tolist(self) -> List: ...'], 'requires_grad_': ['def requires_grad_(self, mode: _bool=True) -> Tensor: ...'], 'element_size': ['def element_size(self) -> _int: ...'], + 'data_ptr': ['def data_ptr(self) -> _int: ...'], 'dim': ['def dim(self) -> _int: ...'], 'nonzero': ['def nonzero(self, *, as_tuple: _bool=...) -> Tensor: ...'], 'numel': ['def numel(self) -> _int: ...'], @@ -576,6 +573,10 @@ def gen_pyi(declarations_path, out): ], 'item': ["def item(self) -> Number: ..."], 'copy_': ["def copy_(self, src: Tensor, non_blocking: _bool=False) -> Tensor: ..."], + 'set_': ['def set_(self, storage: Storage, offset: _int, size: _size, stride: _size) -> Tensor: ...', + 'def set_(self, storage: Storage) -> Tensor: ...'], + 'split': ['def split(self, split_size: _int, dim: _int=0) -> Sequence[Tensor]: ...', + 'def split(self, split_size: Tuple[_int, ...], dim: _int=0) -> Sequence[Tensor]: ...'], }) for binop in ['mul', 'div', 'true_divide', 'floor_divide']: for inplace in [False, True]: diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 41e0e887f829..2543e724b1e0 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -87,6 +87,9 @@ ${dtype_class_hints} class layout: ... +# Defined in torch/csrc/utils/disable_torch_function.cpp +def DisableTorchFunction(): ... + # Defined in torch/csrc/utils/tensor_layouts.cpp strided : layout = ... sparse_coo : layout = ... @@ -105,6 +108,10 @@ class qscheme: ... # Defined in torch/csrc/utils/tensor_qschemes.cpp per_tensor_affine: qscheme = ... +per_channel_affine: qscheme = ... +per_tensor_symmetric: qscheme = ... +per_channel_symmetric: qscheme = ... +per_channel_affine_float_qparams: qscheme = ... 
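# A minimal sketch (editorial, not part of the patch) of what the stubs above buy:
# with the qscheme constants declared, annotated helpers like this now pass mypy.
# Assumes a regular `import torch` at the top of the checked module.
def _is_per_channel(q: torch.qscheme) -> bool:
    return q in (torch.per_channel_affine, torch.per_channel_symmetric)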
# Defined in torch/csrc/autograd/python_function.cpp class _FunctionBase(object): diff --git a/torch/tensor.py b/torch/tensor.py index 18dccfda7c8b..3eadb4667e87 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -7,6 +7,7 @@ import warnings import weakref from torch._C import _add_docstr +from typing import Any, Dict, Tuple, Union from numbers import Number import functools from typing import Optional @@ -53,6 +54,8 @@ def __deepcopy__(self, memo): else: new_storage = self.storage().__deepcopy__(memo) if self.is_quantized: + # quantizer_params can be different type based on torch attribute + quantizer_params: Union[Tuple[torch.qscheme, float, int], Tuple[torch.qscheme, Tensor, Tensor, int]] if self.qscheme() == torch.per_tensor_affine: quantizer_params = self.qscheme(), self.q_scale(), self.q_zero_point() elif self.qscheme() in (torch.per_channel_affine, torch.per_channel_affine_float_qparams): @@ -85,6 +88,7 @@ def __reduce_ex__(self, proto): check_serializing_named_tensor(self) # See Note [Don't serialize hooks] torch.utils.hooks.warn_if_has_hooks(self) + backward_hooks: Dict[Any, Any] = OrderedDict() # Note: Numpy array is chosen to be the rebuild component for XLA Tensor. # We considered a few options: # 1. CPU tensor can't be used here. @@ -96,12 +100,14 @@ def __reduce_ex__(self, proto): # `tolist()` converts every single element in the tensor into python objects # and serialize them one by one. if self.device.type == 'xla': - args = (self.cpu().numpy(), - self.dtype, - str(self.device), - self.requires_grad) - return (torch._utils._rebuild_xla_tensor, args) + arg_xla = (self.cpu().numpy(), + self.dtype, + str(self.device), + self.requires_grad) + return (torch._utils._rebuild_xla_tensor, arg_xla) if self.is_quantized: + # quantizer_params can be different type based on torch attribute + quantizer_params: Union[Tuple[torch.qscheme, float, int], Tuple[Any, Tensor, Tensor, int]] if self.qscheme() == torch.per_tensor_affine: quantizer_params = (torch.per_tensor_affine, self.q_scale(), @@ -116,31 +122,31 @@ def __reduce_ex__(self, proto): self.q_per_channel_axis()) else: raise RuntimeError(f"Serialization is not supported for tensors of type {self.qscheme()}") - args = (self.storage(), - self.storage_offset(), - tuple(self.size()), - self.stride(), - quantizer_params, - self.requires_grad, - OrderedDict()) - return (torch._utils._rebuild_qtensor, args) + args_qtensor = (self.storage(), + self.storage_offset(), + tuple(self.size()), + self.stride(), + quantizer_params, + self.requires_grad, + backward_hooks) + return (torch._utils._rebuild_qtensor, args_qtensor) elif self.is_sparse: if self.layout == torch.sparse_coo: - args = (self.layout, - (self._indices(), - self._values(), - self.size())) + args_sparse = (self.layout, + (self._indices(), + self._values(), + self.size())) else: raise NotImplementedError( 'sparse tensor __reduce_ex__ for layout `%s`' % (self.layout)) - return (torch._utils._rebuild_sparse_tensor, args) + return (torch._utils._rebuild_sparse_tensor, args_sparse) else: args = (self.storage(), self.storage_offset(), tuple(self.size()), self.stride(), self.requires_grad, - OrderedDict()) # previously was self._backward_hooks + backward_hooks) # previously was self._backward_hooks return (torch._utils._rebuild_tensor_v2, args) def __setstate__(self, state): @@ -528,7 +534,7 @@ def __format__(self, format_spec): return self.item().__format__(format_spec) return object.__format__(self, format_spec) - def __ipow__(self, other): + def __ipow__(self, other): # type: 
ignore[misc] relevant_args = (self, other) from torch.overrides import has_torch_function, handle_torch_function if type(self) is not Tensor and type(other) is not Tensor and has_torch_function(relevant_args): @@ -652,7 +658,8 @@ def __contains__(self, element): if type(self) is not Tensor and has_torch_function(relevant_args): return handle_torch_function(Tensor.__contains__, relevant_args, self, element) if isinstance(element, (torch.Tensor, Number)): - return (element == self).any().item() + # type hint doesn't understand the __contains__ result array + return (element == self).any().item() # type: ignore[union-attr] raise RuntimeError( "Tensor.__contains__ only supports Tensor or scalar, but you passed in a %s." % @@ -669,7 +676,8 @@ def __cuda_array_interface__(self): relevant_args = (self,) from torch.overrides import has_torch_function, handle_torch_function if type(self) is not Tensor and has_torch_function(relevant_args): - return handle_torch_function(Tensor.__cuda_array_interface__.__get__, relevant_args, self) + # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185 + return handle_torch_function(Tensor.__cuda_array_interface__.__get__, relevant_args, self) # type: ignore[attr-defined] # raise AttributeError for unsupported tensors, so that # hasattr(cpu_tensor, "__cuda_array_interface__") is False. @@ -936,7 +944,8 @@ def grad(self): relevant_args = (self,) from torch.overrides import has_torch_function, handle_torch_function if type(self) is not Tensor and has_torch_function(relevant_args): - return handle_torch_function(Tensor.grad.__get__, relevant_args, self) + # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185 + return handle_torch_function(Tensor.grad.__get__, relevant_args, self) # type: ignore[attr-defined] if self.requires_grad and not hasattr(self, "retains_grad") and not self.is_leaf and self._grad is None: warnings.warn("The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad " @@ -951,7 +960,8 @@ def grad(self, new_grad): relevant_args = (self,) from torch.overrides import has_torch_function, handle_torch_function if type(self) is not Tensor and has_torch_function(relevant_args): - return handle_torch_function(Tensor.grad.__set__, relevant_args, self, new_grad) + # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185 + return handle_torch_function(Tensor.grad.__set__, relevant_args, self, new_grad) # type: ignore[attr-defined] self._grad = new_grad @grad.deleter @@ -959,7 +969,8 @@ def grad(self): relevant_args = (self,) from torch.overrides import has_torch_function, handle_torch_function if type(self) is not Tensor and has_torch_function(relevant_args): - return handle_torch_function(Tensor.grad.__delete__, relevant_args, self) + # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185 + return handle_torch_function(Tensor.grad.__delete__, relevant_args, self) # type: ignore[attr-defined] del self._grad @classmethod diff --git a/torch/types.py b/torch/types.py index 0e386fc3e134..2aee8cd7ddde 100644 --- a/torch/types.py +++ b/torch/types.py @@ -34,13 +34,25 @@ class Storage(object): _cdata: int + def __deepcopy__(self, memo) -> 'Storage': + ... + + def _new_shared(self, int) -> 'Storage': + ... + def _write_file(self, f: Any, is_real_file: _bool, save_size: _bool) -> None: ... - def size(self) -> int: + def element_size(self) -> int: ... 
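# A minimal sketch (editorial, not part of the patch) of code the extended Storage
# stub lets mypy check; size() and element_size() are the methods declared above.
# Assumes `from torch.types import Storage` in the checked module.
def _storage_nbytes(s: 'Storage') -> int:
    return s.size() * s.element_size()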
- def _new_shared(self, int) -> 'Storage': + def is_shared(self) -> bool: + ... + + def share_memory_(self) -> 'Storage': + ... + + def size(self) -> int: ... ... From 71d1b5b0e227e407e60c0a3dd6a4caabdcd6c89a Mon Sep 17 00:00:00 2001 From: iurii zdebskyi <47012416+izdeby@users.noreply.github.com> Date: Thu, 24 Sep 2020 08:24:46 -0700 Subject: [PATCH 087/449] Add foreach APIs for binary ops with ScalarList (#44743) Summary: In this PR: 1) Added binary operations with ScalarLists. 2) Fixed _foreach_div(...) bug in native_functions 3) Covered all possible cases with scalars and scalar lists in tests 4) [minor] fixed bug in native_functions by adding "use_c10_dispatcher: full" to all _foreach functions tested via unit tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/44743 Reviewed By: bwasti, malfet Differential Revision: D23753711 Pulled By: izdeby fbshipit-source-id: bf3e8c54bc07867e8f6e82b5d3d35ff8e99b5a0a --- aten/src/ATen/native/ForeachOpsKernels.cpp | 24 + aten/src/ATen/native/ForeachUtils.h | 14 + .../native/cuda/ForeachBinaryOpScalarList.cu | 60 ++ aten/src/ATen/native/cuda/ForeachFunctors.cuh | 115 ++++ .../src/ATen/native/cuda/MultiTensorApply.cuh | 70 +++ aten/src/ATen/native/native_functions.yaml | 97 +++- .../check_backward_compatibility.py | 4 + test/test_foreach.py | 529 ++++++++++++++---- test/test_native_functions.py | 2 +- tools/autograd/gen_python_functions.py | 1 + .../templates/python_torch_functions.cpp | 1 + tools/codegen/model.py | 4 + tools/pyi/gen_pyi.py | 1 + torch/csrc/utils/python_arg_parser.cpp | 22 +- torch/csrc/utils/python_arg_parser.h | 18 +- 15 files changed, 843 insertions(+), 119 deletions(-) create mode 100644 aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 912b5116c4cc..73eb2070c07d 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -24,6 +24,26 @@ std::vector foreach_tensor_##NAME##_scalar_kernel_slow(TensorList tensor return result; \ } +#define FOREACH_BINARY_OP_SCALARLIST(NAME) \ +void foreach_tensor_##NAME##_scalarlist_kernel_slow_(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors, scalars); \ + \ + for (int i = 0; i < tensors.size(); i++) { \ + tensors[i].NAME##_(scalars[i]); \ + } \ +} \ + \ +std::vector foreach_tensor_##NAME##_scalarlist_kernel_slow(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors, scalars); \ + std::vector result; \ + result.reserve(tensors.size()); \ + for (int i = 0; i < tensors.size(); i++) { \ + result.emplace_back(tensors[i].NAME(scalars[i])); \ + } \ + \ + return result; \ +} + #define FOREACH_BINARY_OP_LIST(NAME) \ std::vector foreach_tensor_##NAME##_list_kernel_slow(TensorList tensors1, TensorList tensors2) { \ check_foreach_api_restrictions(tensors1, tensors2); \ @@ -117,6 +137,10 @@ FOREACH_BINARY_OP_SCALAR(add); FOREACH_BINARY_OP_SCALAR(sub); FOREACH_BINARY_OP_SCALAR(mul); FOREACH_BINARY_OP_SCALAR(div); +FOREACH_BINARY_OP_SCALARLIST(add); +FOREACH_BINARY_OP_SCALARLIST(sub); +FOREACH_BINARY_OP_SCALARLIST(mul); +FOREACH_BINARY_OP_SCALARLIST(div); FOREACH_BINARY_OP_LIST(mul); FOREACH_BINARY_OP_LIST(div); FOREACH_UNARY_OP(sqrt); diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 5a7aced74702..44e6a50297db 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -31,6 +31,12 @@ void 
check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2) { } } +void check_foreach_api_restrictions(TensorList tensors, ArrayRef scalars) { + TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor."); + TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value."); + TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list."); +} + // To go via 'fast' path, several conditions must be satisfied // - All tensors must be on the same device // - All tensors must have strided layout @@ -132,5 +138,13 @@ bool can_use_fast_route(TensorList tensors) { return true; } +bool can_use_fast_route(TensorList tensors, ArrayRef scalars) { + TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor."); + TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value."); + TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list."); + + return can_use_fast_route(tensors); +} + } }} // at::native diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu new file mode 100644 index 000000000000..684f12732ffc --- /dev/null +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu @@ -0,0 +1,60 @@ +#include +#include +#include + +namespace at { namespace native { + +template class Op> +std::vector foreach_binary_op(TensorList tensors, at::ArrayRef scalars) { + std::vector> tensor_lists; + std::vector vec_res; + for (const auto& t: tensors) { + vec_res.emplace_back(at::native::empty_like(t)); + } + + tensor_lists.emplace_back(tensors.vec()); + tensor_lists.emplace_back(vec_res); + + AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda", [&]() { + multi_tensor_apply<2>(tensor_lists, scalars, BinaryOpScalarListFunctor()); + }); + return tensor_lists[1]; +} + +template class Op> +void foreach_binary_op_(TensorList tensors, at::ArrayRef scalars) { + std::vector> tensor_lists; + tensor_lists.emplace_back(tensors.vec()); + + AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda_", [&]() { + multi_tensor_apply<1>(tensor_lists, scalars, BinaryOpScalarListFunctor_()); + }); +} + +#define FOREACH_BINARY_OP_SCALARLIST(NAME, OP) \ +void foreach_tensor_##NAME##_scalarlist_kernel_cuda_(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors); \ + \ + if (!can_use_fast_route(tensors, scalars)) { \ + return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow_(tensors, scalars); \ + } \ + \ + foreach_binary_op_(tensors, scalars); \ +} \ + \ +std::vector foreach_tensor_##NAME##_scalarlist_kernel_cuda(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors); \ + \ + if (!can_use_fast_route(tensors, scalars)) { \ + return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow(tensors, scalars); \ + } \ + \ + return foreach_binary_op(tensors, scalars); \ +} + +FOREACH_BINARY_OP_SCALARLIST(add, std::plus); +FOREACH_BINARY_OP_SCALARLIST(sub, std::minus); +FOREACH_BINARY_OP_SCALARLIST(mul, std::multiplies); +FOREACH_BINARY_OP_SCALARLIST(div, std::divides); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/ForeachFunctors.cuh b/aten/src/ATen/native/cuda/ForeachFunctors.cuh index a04d27110c9a..e83eca3dd8e1 100644 --- a/aten/src/ATen/native/cuda/ForeachFunctors.cuh +++ 
b/aten/src/ATen/native/cuda/ForeachFunctors.cuh @@ -118,6 +118,121 @@ struct BinaryOpScalarFunctor { } }; +template class Op> +struct BinaryOpScalarListFunctor_ { + __device__ void operator() ( + int chunk_size, + TensorListScalarListMetadata<1>& tl) { + int tensor_loc = tl.block_to_tensor[blockIdx.x]; + int chunk_idx = tl.block_to_chunk[blockIdx.x]; + int n = tl.sizes[tensor_loc]; + + T* x = (T*)tl.addresses[0][tensor_loc]; + x += chunk_idx * chunk_size; + + double y = tl.scalar_vals[tensor_loc]; + + n -= chunk_idx * chunk_size; + + T r_x[kILP]; + + // to make things simple, we put aligned case in a different code path + if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x)) { + for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { + // load + load_store(r_x, x, 0 , i_start); +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = Op()(static_cast(r_x[ii]), y); + } + // store + load_store(x, r_x, i_start, 0); + } + } + else { + for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = 0; + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) { + r_x[ii] = x[i]; + } + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = Op()(static_cast(r_x[ii]), y); + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) + x[i] = r_x[ii]; + } + } + } + } +}; + +template class Op> +struct BinaryOpScalarListFunctor { + __device__ void operator() ( + int chunk_size, + TensorListScalarListMetadata<2>& tl) { + int tensor_loc = tl.block_to_tensor[blockIdx.x]; + int chunk_idx = tl.block_to_chunk[blockIdx.x]; + int n = tl.sizes[tensor_loc]; + + T* x = (T*)tl.addresses[0][tensor_loc]; + x += chunk_idx * chunk_size; + + T* out = (T*)tl.addresses[1][tensor_loc]; + out += chunk_idx * chunk_size; + + double y = tl.scalar_vals[tensor_loc]; + + n -= chunk_idx * chunk_size; + + T r_x[kILP]; + + // to make things simple, we put aligned case in a different code path + if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x) && is_aligned(out)) { + for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { + // load + load_store(r_x, x, 0 , i_start); +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = Op()(static_cast(r_x[ii]), y); + } + // store + load_store(out, r_x, i_start, 0); + } + } + else { + for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = 0; + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) { + r_x[ii] = x[i]; + } + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = Op()(static_cast(r_x[ii]), y); + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) + out[i] = r_x[ii]; + } + } + } + } +}; + template class Op> struct BinaryOpListAlphaFunctor_ { __device__ void operator() ( diff --git a/aten/src/ATen/native/cuda/MultiTensorApply.cuh b/aten/src/ATen/native/cuda/MultiTensorApply.cuh index f82a0d9a58c8..d162af19fd1b 100644 --- a/aten/src/ATen/native/cuda/MultiTensorApply.cuh +++ b/aten/src/ATen/native/cuda/MultiTensorApply.cuh @@ -26,6 +26,7 @@ __device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int s // 
TensorListMetadata has to be < 4KB - the limit for kernel launch argument static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; +static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30}; template struct TensorListMetadata { @@ -35,6 +36,15 @@ template struct TensorListMetadata int block_to_chunk[depth_to_max_blocks[n-1]]; }; +template struct TensorListScalarListMetadata +{ + void* addresses[n][depth_to_max_tensors_scalarlist[n-1]]; + int sizes[depth_to_max_tensors_scalarlist[n-1]]; + double scalar_vals[depth_to_max_tensors_scalarlist[n-1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n-1]]; + int block_to_chunk[depth_to_max_blocks[n-1]]; +}; + template C10_LAUNCH_BOUNDS_1(kBlockSize) __global__ void @@ -49,11 +59,71 @@ multi_tensor_apply_kernel( template void multi_tensor_apply( std::vector>& tensor_lists, + at::ArrayRef scalars, T callable, ArgTypes... args) { TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth."); const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); + size_t n_tensors = tensor_lists[0].size(); + TensorListScalarListMetadata tensorListMeta; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for(size_t t = 0; t < n_tensors; t++) { + + tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t]; + + tensorListMeta.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); + } + loc_tensor_info++; + + int chunks = (tensor_lists[0][t].numel() + kChunkSize - 1)/kChunkSize; + for (int chunk = 0; chunk < chunks; chunk++) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + bool tensors_full = (loc_tensor_info == depth_to_max_tensors_scalarlist[depth-1] && + chunk == chunks - 1); + bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]); + bool last_chunk = (t == n_tensors - 1 && chunk == chunks - 1); + + if (tensors_full || blocks_full || last_chunk) { + multi_tensor_apply_kernel<<>>( + tensorListMeta, + callable, + args...); + + AT_CUDA_CHECK(cudaGetLastError()); + + // Reset. + loc_block_info = 0; + if(chunk == chunks - 1) { + loc_tensor_info = 0; + } + else { + tensorListMeta.sizes[0] = tensorListMeta.sizes[loc_tensor_info-1]; + tensorListMeta.scalar_vals[0] = tensorListMeta.scalar_vals[loc_tensor_info-1]; + for(int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][0] = tensorListMeta.addresses[d][loc_tensor_info-1]; + } + loc_tensor_info = 1; + } + } + } + } + } + +template +void multi_tensor_apply( + std::vector>& tensor_lists, + T callable, + ArgTypes... 
args) { + TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth."); + const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); size_t n_tensors = tensor_lists[0].size(); TensorListMetadata tensorListMeta; diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index f5bbb263ed9c..8068bc1721df 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6187,6 +6187,7 @@ CUDA: foreach_tensor_add_scalar_kernel_cuda - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6194,6 +6195,7 @@ CUDA: foreach_tensor_add_scalar_kernel_cuda_ - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6201,6 +6203,7 @@ CUDA: foreach_tensor_sub_scalar_kernel_cuda - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6208,6 +6211,7 @@ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6215,6 +6219,7 @@ CUDA: foreach_tensor_mul_scalar_kernel_cuda - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6222,6 +6227,7 @@ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6229,34 +6235,39 @@ CUDA: foreach_tensor_div_scalar_kernel_cuda - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ -- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[] +- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow CUDA: foreach_tensor_add_list_kernel_cuda -- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> () +- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ -- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[] +- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow CUDA: foreach_tensor_sub_list_kernel_cuda -- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> () +- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6264,6 +6275,7 @@ CUDA: foreach_tensor_sub_list_kernel_cuda_ - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] + use_c10_dispatcher: full device_guard: False 
variants: function dispatch: @@ -6271,13 +6283,15 @@ CUDA: foreach_tensor_mul_list_kernel_cuda - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ -- func: _foreach_div.List(Tensor(a!)[] self, Tensor[] other) -> Tensor[] +- func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6285,13 +6299,79 @@ CUDA: foreach_tensor_div_list_kernel_cuda - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ +- func: _foreach_add.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow + CUDA: foreach_tensor_add_scalarlist_kernel_cuda + +- func: _foreach_add_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow_ + CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ + +- func: _foreach_sub.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda + +- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow_ + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ + +- func: _foreach_div.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow + CUDA: foreach_tensor_div_scalarlist_kernel_cuda + +- func: _foreach_div_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow_ + CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ + +- func: _foreach_mul.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda + +- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow_ + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ + - func: _foreach_exp(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6299,6 +6379,7 @@ CUDA: foreach_tensor_exp_cuda - func: _foreach_exp_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6306,6 +6387,7 @@ CUDA: foreach_tensor_exp_cuda_ - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6313,6 +6395,7 @@ CUDA: foreach_tensor_sqrt_cuda - func: _foreach_sqrt_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full 
device_guard: False variants: function dispatch: @@ -6320,6 +6403,7 @@ CUDA: foreach_tensor_sqrt_cuda_ - func: _foreach_addcdiv_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6327,6 +6411,7 @@ CUDA: foreach_tensor_addcdiv_cuda_ - func: _foreach_addcmul_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6334,6 +6419,7 @@ CUDA: foreach_tensor_addcmul_cuda_ - func: _foreach_addcdiv(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6341,6 +6427,7 @@ CUDA: foreach_tensor_addcdiv_cuda - func: _foreach_addcmul(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 739a4de51951..4303fc563cfc 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -99,6 +99,10 @@ ("preprocess", datetime.date(2020, 10, 1)), ("compile", datetime.date(2020, 10, 1)), ("execute", datetime.date(2020, 10, 1)), + ("aten::_foreach_add", datetime.date(2020, 10, 1)), + ("aten::_foreach_sub_", datetime.date(2020, 10, 1)), + ("aten::_foreach_div", datetime.date(2020, 10, 1)), + ("aten::_foreach_sub", datetime.date(2020, 10, 1)), ] diff --git a/test/test_foreach.py b/test/test_foreach.py index 8369ba5b9be5..85d79096b2ad 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -4,21 +4,30 @@ from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, skipCUDAIfRocm class TestForeach(TestCase): - bin_ops = [ + foreach_bin_ops = [ torch._foreach_add, - torch._foreach_add_, torch._foreach_sub, - torch._foreach_sub_, torch._foreach_mul, - torch._foreach_mul_, torch._foreach_div, + ] + + foreach_bin_ops_ = [ + torch._foreach_add_, + torch._foreach_sub_, + torch._foreach_mul_, torch._foreach_div_, ] + torch_bin_ops = [ + torch.add, + torch.sub, + torch.mul, + torch.div, + ] + def _get_test_data(self, device, dtype, N): if dtype in [torch.bfloat16, torch.bool, torch.float16]: tensors = [torch.randn(N, N, device=device).to(dtype) for _ in range(N)] - elif dtype in torch.testing.get_all_int_dtypes(): tensors = [torch.randint(1, 100, (N, N), device=device, dtype=dtype) for _ in range(N)] else: @@ -26,36 +35,39 @@ def _get_test_data(self, device, dtype, N): return tensors - def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - - expected = [torch_op(tensors1[i], tensors2[i]) for i in range(N)] - res = foreach_op(tensors1, tensors2) - foreach_op_(tensors1, tensors2) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, expected) - - def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): - tensors1 = self._get_test_data(device, dtype, N) - expected = [torch_op(tensors1[i]) for i in range(N)] - res = foreach_op(tensors1) - foreach_op_(tensors1) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, expected) - - def _test_pointwise_op(self, device, dtype, foreach_op, foreach_op_, 
torch_op, N=20): - tensors = self._get_test_data(device, dtype, N) - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - value = 2 - - expected = [torch_op(tensors[i], tensors1[i], tensors2[i], value=value) for i in range(N)] - - res = foreach_op(tensors, tensors1, tensors2, value) - foreach_op_(tensors, tensors1, tensors2, value) - self.assertEqual(res, tensors) - self.assertEqual(tensors, expected) + def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + + expected = [torch_op(tensors1[i], tensors2[i]) for i in range(N)] + res = foreach_op(tensors1, tensors2) + foreach_op_(tensors1, tensors2) + self.assertEqual(res, tensors1) + self.assertEqual(tensors1, res) + + def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + expected = [torch_op(tensors1[i]) for i in range(N)] + res = foreach_op(tensors1) + foreach_op_(tensors1) + self.assertEqual(res, tensors1) + self.assertEqual(tensors1, expected) + + def _test_pointwise_op(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors = self._get_test_data(device, dtype, N) + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + value = 2 + + expected = [torch_op(tensors[i], tensors1[i], tensors2[i], value=value) for i in range(N)] + + res = foreach_op(tensors, tensors1, tensors2, value) + foreach_op_(tensors, tensors1, tensors2, value) + self.assertEqual(res, tensors) + self.assertEqual(tensors, expected) def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): tensors1 = self._get_test_data(device, dtype, N) @@ -63,8 +75,8 @@ def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_ alpha = 2 expected = [torch_op(tensors1[i], torch.mul(tensors2[i], alpha)) for i in range(N)] - res = foreach_op(tensors1, tensors2, alpha) - foreach_op_(tensors1, tensors2, alpha) + res = foreach_op(tensors1, tensors2, alpha=alpha) + foreach_op_(tensors1, tensors2, alpha=alpha) self.assertEqual(res, tensors1) if dtype == torch.bool: @@ -88,7 +100,7 @@ def test_exp(self, device, dtype): @skipCUDAIfRocm @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) def test_addcmul(self, device, dtype): - if device == 'cpu': + if self.device_type == 'cpu': if dtype == torch.half: with self.assertRaisesRegex(RuntimeError, r"\"addcmul_cpu_out\" not implemented for \'Half\'"): self._test_pointwise_op(device, dtype, torch._foreach_addcmul, @@ -105,7 +117,7 @@ def test_addcdiv(self, device, dtype): self._test_pointwise_op(device, dtype, torch._foreach_addcdiv, torch._foreach_addcdiv_, torch.addcdiv) return - if device == 'cpu': + if self.device_type == 'cpu': if dtype == torch.half: with self.assertRaisesRegex(RuntimeError, r"\"addcdiv_cpu_out\" not implemented for \'Half\'"): self._test_pointwise_op(device, dtype, torch._foreach_addcdiv, @@ -118,83 +130,372 @@ def test_addcdiv(self, device, dtype): # @dtypes(*torch.testing.get_all_dtypes()) def test_int_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - int_scalar = 1 - - # bool tensor + 1 will result in int64 tensor - if dtype == torch.bool: - expected = [torch.ones(10, 10, device=device, 
dtype=torch.int64) for _ in range(10)] - else: - expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] - - res = torch._foreach_add(tensors, int_scalar) - self.assertEqual(res, expected) - - if dtype in [torch.bool]: - with self.assertRaisesRegex(RuntimeError, - "result type Long can't be cast to the desired output type Bool"): - torch._foreach_add_(tensors, int_scalar) - else: - torch._foreach_add_(tensors, int_scalar) - self.assertEqual(res, tensors) + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = 3 + expected = [torch_bin_op(t, scalar) for t in tensors] + + res = foreach_bin_op(tensors, scalar) + + if dtype == torch.bool: + self.assertEqual(res, expected) + + with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + return + + + if foreach_bin_op_ == torch._foreach_div_ and dtype in torch.testing.integral_types() and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, + "can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + return + + # TODO[type promotion]: Fix once type promotion is enabled. + if dtype in torch.testing.integral_types() and self.device_type == 'cuda': + self.assertEqual(res, [e.to(dtype) for e in expected]) + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, expected) + + # TODO[Fix scalar list]: + # We need to update codegen to correctly handle function overloads with float[] and int[]. + # As optimizers work with float tensors, the result will always be torch.float32 for now. + # Current schema is using 'float[]' as scalar list type. + @dtypes(*torch.testing.get_all_dtypes()) + def test_int_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [1 for _ in range(N)] + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + + # we dont support bool and complex types on CUDA for now + if (dtype in torch.testing.get_all_complex_dtypes() or dtype == torch.bool) and self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + return + + res = foreach_bin_op(tensors, scalars) + + if dtype == torch.bool: + self.assertEqual(res, [torch_bin_op(t.to(torch.float32), s) for t, s in zip(tensors, scalars)]) + + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + + if dtype in torch.testing.integral_types(): + if self.device_type == 'cpu': + self.assertEqual(res, [e.to(torch.float32) for e in expected]) + else: + # TODO[type promotion]: Fix once type promotion is enabled. 
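As a standalone illustration of the promotion rule the bool branch above relies on — a bool tensor plus an int scalar promotes to int64, so the out-of-place foreach op succeeds while the in-place variant must raise — here is a minimal sketch (the shapes and scalar value are illustrative, not taken from the test):

    import torch

    # Out-of-place: bool + int promotes, so the result list holds int64 tensors.
    bools = [torch.zeros(2, 2, dtype=torch.bool) for _ in range(3)]
    res = torch._foreach_add(bools, 1)
    assert res[0].dtype == torch.int64

    # In-place: the promoted result cannot be written back into bool storage, so
    # this is expected to raise ("... can't be cast to the desired output type").
    try:
        torch._foreach_add_(bools, 1)
    except RuntimeError as err:
        print(err)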
+ self.assertEqual(res, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types() and self.device_type == 'cpu': + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + else: + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_float_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - float_scalar = 1. + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = 3.3 + expected = [torch_bin_op(t, scalar) for t in tensors] + + if dtype == torch.bool: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op(tensors, scalar) + return + + res = foreach_bin_op(tensors, scalar) + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types(): + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + return + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, expected) - # float scalar + integral tensor will result in float tensor - if dtype in [torch.uint8, torch.int8, torch.int16, - torch.int32, torch.int64, torch.bool]: - expected = [torch.ones(10, 10, device=device, dtype=torch.float32) for _ in range(10)] - else: - expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] - - res = torch._foreach_add(tensors, float_scalar) - self.assertEqual(res, expected) - - if dtype in [torch.uint8, torch.int8, torch.int16, - torch.int32, torch.int64, torch.bool]: - self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, float_scalar)) - else: - torch._foreach_add_(tensors, float_scalar) - self.assertEqual(res, tensors) + @dtypes(*torch.testing.get_all_dtypes()) + def test_float_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [1.1 for _ in range(N)] + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + + # we dont support bool and complex types on CUDA for now + if (dtype in torch.testing.get_all_complex_dtypes() or dtype == torch.bool) and self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + return + + res = foreach_bin_op(tensors, scalars) + + if dtype == torch.bool: + # see TODO[Fix scalar list] + self.assertEqual(res, [torch_bin_op(t.to(torch.float32), s) for t, s in zip(tensors, scalars)]) + + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + + if dtype in torch.testing.integral_types() and self.device_type == 'cuda': + # see TODO[Fix scalar list] + self.assertEqual(res, [e.to(dtype) for e in expected]) + + 
foreach_bin_op_(tensors, scalars) + self.assertEqual(tensors, res) + return + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types() and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + + foreach_bin_op_(tensors, scalars) + self.assertEqual(tensors, expected) @dtypes(*torch.testing.get_all_dtypes()) def test_complex_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - complex_scalar = 3 + 5j - - # bool tensor + 1 will result in int64 tensor - expected = [torch.add(complex_scalar, torch.zeros(10, 10, device=device, dtype=dtype)) for _ in range(10)] - - if dtype in [torch.float16, torch.float32, torch.float64, torch.bfloat16] and device == 'cuda:0': - # value cannot be converted to dtype without overflow: - self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar)) - self.assertRaises(RuntimeError, lambda: torch._foreach_add(tensors, complex_scalar)) - return - - res = torch._foreach_add(tensors, complex_scalar) - self.assertEqual(res, expected) + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = 3 + 5j + expected = [torch_bin_op(t, scalar) for t in tensors] + + if dtype == torch.bool: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op(tensors, scalar) + return + + if dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=True) and \ + self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): + foreach_bin_op(tensors, scalar) + return + + res = foreach_bin_op(tensors, scalar) + self.assertEqual(res, expected) + + if dtype not in [torch.complex64, torch.complex128]: + with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + else: + foreach_bin_op_(tensors, scalar) + self.assertEqual(res, tensors) - if dtype not in [torch.complex64, torch.complex128]: - self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar)) - else: - torch._foreach_add_(tensors, complex_scalar) - self.assertEqual(res, tensors) + @dtypes(*torch.testing.get_all_dtypes()) + def test_complex_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [3 + 5j for _ in range(N)] + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + + if dtype == torch.bool: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op(tensors, scalar) + return + + with self.assertRaisesRegex(TypeError, "argument 
'scalars' must be tuple of floats"): + res = foreach_bin_op(tensors, scalars) + + with self.assertRaisesRegex(TypeError, "argument 'scalars' must be tuple of floats"): + foreach_bin_op_(tensors, scalars) @dtypes(*torch.testing.get_all_dtypes()) def test_bool_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - bool_scalar = True + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = True + + if dtype == torch.bool: + expected = [torch_bin_op(t, scalar) for t in tensors] + res = foreach_bin_op(tensors, scalar) + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, res) + return + + if foreach_bin_op == torch._foreach_sub and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator"): + res = foreach_bin_op(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator"): + foreach_bin_op_(tensors, scalar) + elif foreach_bin_op == torch._foreach_sub and self.device_type == 'cuda': + res = foreach_bin_op(tensors, scalar) + self.assertEqual(res, foreach_bin_op(tensors, 1)) + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, res) + else: + expected = [torch_bin_op(t, scalar) for t in tensors] + res = foreach_bin_op(tensors, scalar) + + # TODO[type promotion]: Fix once type promotion is enabled. + if dtype in torch.testing.integral_types() and self.device_type == 'cuda': + self.assertEqual(res, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types(): + if foreach_bin_op == torch._foreach_div and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired "): + foreach_bin_op_(tensors, scalar) + else: + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, res) + else: + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, expected) - expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] - - res = torch._foreach_add(tensors, bool_scalar) - self.assertEqual(res, expected) - - torch._foreach_add_(tensors, bool_scalar) - self.assertEqual(res, tensors) + @dtypes(*torch.testing.get_all_dtypes()) + def test_bool_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [True for _ in range(N)] + + if dtype == torch.bool: + if self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + return + else: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): + foreach_bin_op(tensors, scalars) + else: + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired"): + foreach_bin_op_(tensors, scalars) + + res = foreach_bin_op(tensors, scalars) + for r in res: + self.assertTrue(r.dtype == torch.float32) + else: + # we dont support 
bool and complex types on CUDA for now + if (dtype in torch.testing.get_all_complex_dtypes()) and self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + return + + if foreach_bin_op == torch._foreach_sub: + if self.device_type == "cpu": + # see TODO[Fix scalar list] + res = foreach_bin_op(tensors, scalars) + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [r.to(torch.float32) for r in foreach_bin_op(tensors, 1)]) + + with self.assertRaisesRegex(RuntimeError, "esult type Float can't be cast to the "): + foreach_bin_op_(tensors, scalars) + else: + self.assertEqual(res, foreach_bin_op(tensors, 1)) + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) + else: + # see TODO[Fix scalar list] + res = foreach_bin_op(tensors, scalars) + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [r.to(dtype) for r in foreach_bin_op(tensors, 1)]) + else: + self.assertEqual(res, foreach_bin_op(tensors, 1)) + + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) + else: + if self.device_type == "cpu": + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + res = foreach_bin_op(tensors, scalars) + + # see TODO[Fix scalar list] + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [e.to(torch.float32) for e in expected]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types(): + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired "): + foreach_bin_op_(tensors, scalars) + else: + foreach_bin_op_(tensors, scalars) + self.assertEqual(tensors, expected) + else: + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + res = foreach_bin_op(tensors, scalars) + + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_add_with_different_size_tensors(self, device, dtype): @@ -248,9 +549,9 @@ def test_add_list_error_cases(self, device): # One empty list tensors1.append(torch.tensor([1], device=device)) - with self.assertRaisesRegex(RuntimeError, "Tensor list must have at least one tensor."): + with self.assertRaisesRegex(RuntimeError, "Scalars list must have at least one value."): torch._foreach_add(tensors1, tensors2) - with self.assertRaisesRegex(RuntimeError, "Tensor list must have at least one tensor."): + with self.assertRaisesRegex(RuntimeError, "Scalars list must have at least one value."): torch._foreach_add_(tensors1, tensors2) # Lists have different amount of tensors @@ -318,13 +619,25 @@ def test_div_list(self, device, dtype): self.skipTest("Skipped! 
See https://github.com/pytorch/pytorch/issues/44489") return - self._test_bin_op_list(device, dtype, torch._foreach_div, torch._foreach_div_, torch.div) + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + + if dtype in [torch.bfloat16, torch.bool, torch.float16]: + tensors2 = [torch.zeros(N, N, device=device, dtype=dtype).add(2) for _ in range(N)] + else: + tensors2 = self._get_test_data(device, dtype, N) + + expected = [torch.div(tensors1[i], tensors2[i]) for i in range(N)] + res = torch._foreach_div(tensors1, tensors2) + torch._foreach_div_(tensors1, tensors2) + self.assertEqual(res, tensors1) + self.assertEqual(tensors1, res) def test_bin_op_list_error_cases(self, device): tensors1 = [] tensors2 = [] - for bin_op in self.bin_ops: + for bin_op in self.foreach_bin_ops + self.foreach_bin_ops_: # Empty lists with self.assertRaises(RuntimeError): bin_op(tensors1, tensors2) diff --git a/test/test_native_functions.py b/test/test_native_functions.py index 869c7aad47fb..e5afc79f037a 100644 --- a/test/test_native_functions.py +++ b/test/test_native_functions.py @@ -58,7 +58,7 @@ def fake_module(values, const): self.do_test_optional_floatlist_with_module(fake_module) def test_optional_floatlist_invalid(self): - with self.assertRaisesRegex(TypeError, "must be .* but found"): + with self.assertRaisesRegex(TypeError, "must be tuple of floats, not list"): FloatListWrapperModule()(torch.zeros(1), ["hi"]) with self.assertRaisesRegex(RuntimeError, "value of type .* instead found type"): diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 995dff38030b..8f272de9a5f6 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -281,6 +281,7 @@ def create_python_bindings(python_functions, is_python_method, module): 'c10::optional': 'toBoolOptional', 'c10::optional': 'toDoubleOptional', 'c10::optional>': 'doublelistOptional', + 'ArrayRef': 'doublelist', 'IntArrayRef': 'intlist', 'Scalar': 'scalar', 'ScalarType': 'scalartype', diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 62e9b8dd227f..673af99bce77 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -44,6 +44,7 @@ using at::Generator; using at::TensorList; using at::Dimname; using at::DimnameList; +using at::ArrayRef; using namespace torch::autograd::utils; diff --git a/tools/codegen/model.py b/tools/codegen/model.py index b0c470c91b6a..4ec0dc428b81 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -304,6 +304,10 @@ def __post_init__(self) -> None: # TODO: fixme if str(self.name) not in [ '_amp_non_finite_check_and_unscale_', + '_foreach_add_.ScalarList', + '_foreach_sub_.ScalarList', + '_foreach_mul_.ScalarList', + '_foreach_div_.ScalarList', '_foreach_add_.Scalar', '_foreach_sub_.Scalar', '_foreach_mul_.Scalar', diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 7079c6750223..d24966f9fb52 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -146,6 +146,7 @@ def type_to_python(typename, size=None): 'Dimname': 'Union[str, ellipsis, None]', 'DimnameList': 'Sequence[Union[str, ellipsis, None]]', 'QScheme': '_qscheme', + 'ArrayRef' : 'Sequence[float]' }[typename] return typename diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index e954bef398e9..f9e26af63ada 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ 
b/torch/csrc/utils/python_arg_parser.cpp @@ -366,6 +366,23 @@ bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector return true; } +bool is_float_list(PyObject* obj) { + auto tuple = six::isTuple(obj); + if (!(tuple || PyList_Check(obj))) { + return false; + } + + auto size = tuple ? PyTuple_GET_SIZE(obj) : PyList_GET_SIZE(obj); + if (size > 0) { + PyObject* iobj = tuple ? PyTuple_GET_ITEM(obj, 0) : PyList_GET_ITEM(obj, 0); + if (!THPUtils_checkDouble(iobj) && !PyComplex_Check(iobj)) { + return false; + } + } + + return true; +} + // argnum is needed for raising the TypeError, it's used in the error message. auto FunctionParameter::check(PyObject* obj, std::vector &overloaded_args, int argnum) -> bool { @@ -420,7 +437,9 @@ auto FunctionParameter::check(PyObject* obj, std::vector &overloaded // if a size is specified (e.g. IntArrayRef[2]) we also allow passing a single int return size > 0 && THPUtils_checkLong(obj); } - case ParameterType::FLOAT_LIST: return (PyTuple_Check(obj) || PyList_Check(obj)); + case ParameterType::FLOAT_LIST: { + return is_float_list(obj); + } case ParameterType::GENERATOR: return THPGenerator_Check(obj); case ParameterType::BOOL: return PyBool_Check(obj); case ParameterType::STORAGE: return isStorage(obj); @@ -901,6 +920,7 @@ PythonArgs PythonArgParser::raw_parse(PyObject* self, PyObject* args, PyObject* print_error(self, args, kwargs, parsed_args); } + void PythonArgParser::print_error(PyObject* self, PyObject* args, PyObject* kwargs, PyObject* parsed_args[]) { // NOLINT auto num_args = PyTuple_GET_SIZE(args) + (kwargs ? PyDict_Size(kwargs) : 0); std::vector plausible_idxs; diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 78efb6cf2db3..d0e2bdc074ff 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -173,6 +173,8 @@ struct PythonArgs { inline c10::optional toBoolOptional(int i); inline c10::optional toDoubleOptional(int i); inline c10::OptionalArray doublelistOptional(int i); + inline std::vector doublelist(int i); + inline std::vector getDoublelist(int i); inline at::Layout layout(int i); inline at::Layout layoutWithDefault(int i, at::Layout default_layout); inline c10::optional layoutOptional(int i); @@ -369,10 +371,7 @@ inline c10::OptionalArray PythonArgs::intlistOptional(int i) { return intlist(i); } -inline c10::OptionalArray PythonArgs::doublelistOptional(int i) { - if (!args[i]) { - return {}; - } +inline std::vector PythonArgs::getDoublelist(int i) { PyObject* arg = args[i]; auto tuple = PyTuple_Check(arg); auto size = tuple ? 
PyTuple_GET_SIZE(arg) : PyList_GET_SIZE(arg); @@ -390,6 +389,17 @@ inline c10::OptionalArray PythonArgs::doublelistOptional(int i) { return res; } +inline c10::OptionalArray PythonArgs::doublelistOptional(int i) { + if (!args[i]) { + return {}; + } + return this->getDoublelist(i); +} + +inline std::vector PythonArgs::doublelist(int i) { + return this->getDoublelist(i); +} + inline at::ScalarType PythonArgs::scalartypeWithDefault(int i, at::ScalarType default_scalartype) { if (!args[i]) return default_scalartype; return scalartype(i); From bc591d76a10c79f179d0bea016e59096add511a3 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Thu, 24 Sep 2020 08:35:46 -0700 Subject: [PATCH 088/449] add skip_if_rocm to all requires_nccl tests (#45158) Summary: requires_nccl annotation should skip_if_rocm as well Pull Request resolved: https://github.com/pytorch/pytorch/pull/45158 Reviewed By: seemethere Differential Revision: D23879952 Pulled By: walterddr fbshipit-source-id: 818fb31ab75d5f02e77fe3f1367faf748855bee7 --- .../ddp_comm_hooks/test_ddp_hooks.py | 5 ++++ test/distributed/test_c10d.py | 24 +++++++++++++++++++ .../ddp_under_dist_autograd_test.py | 2 ++ 3 files changed, 31 insertions(+) diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py index 2b3d43814c0f..37c8f14af853 100644 --- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py +++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py @@ -14,6 +14,7 @@ MultiProcessTestCase, requires_nccl, skip_if_lt_x_gpu, + skip_if_rocm, ) from torch.testing._internal.common_utils import run_tests @@ -97,6 +98,7 @@ def _run_and_get_grads(self, model): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_allreduce_hook(self): """ This unit test verifies the ``allreduce`` hook registered case gives same result @@ -114,6 +116,7 @@ def test_ddp_comm_hook_allreduce_hook(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_fp16compress_hook(self): """ This unit test verifies the ``fp16 compress`` hook registered case @@ -131,6 +134,7 @@ def test_ddp_comm_hook_fp16compress_hook(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_quantize_per_tensor_hook(self): """ This unit test verifies the ``quantize per tensor`` hook registered case @@ -148,6 +152,7 @@ def test_ddp_comm_hook_quantize_per_tensor_hook(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_quantize_per_channel_hook(self): """ This unit test verifies the ``quantize per channel`` hook registered case diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index cfd0930284a5..b2b7d186713c 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -1559,6 +1559,7 @@ def create(num, prefix): TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment", ) +@skip_if_rocm class ProcessGroupNCCLTest(TestCase): MAIN_PROCESS_RANK = 0 @@ -2123,6 +2124,7 @@ def test_nccl_backend_1gpu_module_device_ids_torch_device_list(self): @requires_nccl() @skip_if_lt_x_gpu(4) + @skip_if_rocm def test_nccl_backend_2gpu_module(self): int_devices = gpus_for_rank(self.world_size)[self.rank][:2] devices = [torch.device("cuda:" + str(i)) for i in int_devices] @@ -2130,6 +2132,7 @@ def test_nccl_backend_2gpu_module(self): @requires_nccl() @skip_if_lt_x_gpu(8) + @skip_if_rocm def test_nccl_backend_4gpu_module(self): int_devices = 
gpus_for_rank(self.world_size)[self.rank][:4] devices = [torch.device("cuda:" + str(i)) for i in int_devices] @@ -2137,6 +2140,7 @@ def test_nccl_backend_4gpu_module(self): @requires_nccl() @skip_if_lt_x_gpu(4) + @skip_if_rocm def test_ddp_multi_device_module_config(self): gpus = gpus_for_rank(self.world_size)[self.rank] @@ -2167,6 +2171,7 @@ def test_ddp_multi_device_module_config(self): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_fp16(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -2198,6 +2203,7 @@ def test_fp16(self): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_arbitrary_forward_return_value(self): """ Note: this test can be sped up by only running it on a CPU module @@ -2482,6 +2488,7 @@ def run_and_verify_grad(model): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_multiple_outputs_multiple_backward(self): """ Note: this test can be sped up by only running it on a CPU module @@ -2532,6 +2539,7 @@ def forward(self, x): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_no_grad(self): """ Note: this test can be sped up by only running it on a CPU module @@ -2643,6 +2651,7 @@ def step_model(model, input, target): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_no_sync(self): """ Runs _test_accumulate_gradients_no_sync using default inputs @@ -2651,6 +2660,7 @@ def test_accumulate_gradients_no_sync(self): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_no_sync_allreduce_hook(self): """ Runs multiple iterations on _test_accumulate_gradients_no_sync @@ -2670,6 +2680,7 @@ def allreduce_hook( @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self): """ Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce @@ -2699,6 +2710,7 @@ def div(fut): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_module(self): # This is NOT the recommended way to implement accumulating grads, but # we would like to make sure DDP does not mess up with the underlying @@ -2840,6 +2852,7 @@ def forward(self, x): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_failure_recovery(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -3161,6 +3174,7 @@ def test_ddp_comm_hook_future_passing_gpu_gloo(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_future_passing_gpu_nccl(self): """ This unit test verifies whether the Future object is passed properly using nccl backend. 
@@ -3178,6 +3192,7 @@ def test_ddp_comm_hook_future_passing_gpu_nccl(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_allreduce_hook_nccl(self): """ This unit test verifies whether a DDP communication hook that just calls @@ -3200,6 +3215,7 @@ def allreduce_hook(state: object, bucket: dist._GradBucket) -> torch._C.Future: @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_allreduce_with_then_hook_nccl(self): """ This unit test verifies whether a DDP communication hook that calls allreduce and then @@ -3591,6 +3607,7 @@ def _run_all_reduce(self, pg): @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_nonblocking(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -3642,36 +3659,42 @@ def _test_nccl_errors_blocking(self, func): @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_clean_exit(self): self._test_nccl_errors_blocking(lambda: sys.exit(0)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_nonzero_exit(self): self._test_nccl_errors_blocking(lambda: sys.exit(1)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_abort(self): self._test_nccl_errors_blocking(lambda: os.abort()) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_sigkill(self): self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGKILL)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_sigterm(self): self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGTERM)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_blocking_wait_with_barrier(self): os.environ["NCCL_BLOCKING_WAIT"] = "1" store = c10d.FileStore(self.file_name, self.world_size) @@ -3694,6 +3717,7 @@ def _run_invalid_nccl_blocking_wait_env(self, val): @requires_nccl() @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_invalid_nccl_blocking_wait_env(self): self._run_invalid_nccl_blocking_wait_env('abc') self._run_invalid_nccl_blocking_wait_env('-1') diff --git a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py index 75d89c33325a..1b1f755ed4cc 100644 --- a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py +++ b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py @@ -18,6 +18,7 @@ requires_gloo, requires_nccl, skip_if_lt_x_gpu, + skip_if_rocm, ) from torch.testing._internal.dist_utils import dist_init from torch.testing._internal.distributed.rpc.rpc_agent_test_fixture import ( @@ -641,6 +642,7 @@ def test_ddp_dist_autograd_local_vs_remote(self): @skip_if_lt_x_gpu(NUM_TRAINERS) @requires_nccl() @dist_init + @skip_if_rocm def test_ddp_dist_autograd_local_vs_remote_gpu(self): # Each trainer uses a different random seed. 
Otherwise, they are going # to have exactly the same initial model parameters, input, and From f9ae296a85c9e3835cd8664d18fea9282c205e58 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Thu, 24 Sep 2020 08:42:44 -0700 Subject: [PATCH 089/449] renaming TestDdpCommHook class so it doesn't get picked up as a test by pytest (#44905) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44905 Test Plan: Imported from OSS Reviewed By: mrshenli Differential Revision: D23825308 Pulled By: bdhirsh fbshipit-source-id: 17a07b3bd211850d6ecca793fd9ef3f326ca9274 --- test/distributed/test_c10d.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index b2b7d186713c..d9faee9197a0 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -1937,7 +1937,7 @@ def forward(self, x): return self.p + x -class TestDdpCommHook(nn.Module): +class ModuleForDdpCommHook(nn.Module): def __init__(self): super().__init__() self.t0 = Task() @@ -3110,7 +3110,7 @@ def test_ddp_comm_hook_future_passing_cpu(self): # Test on CPU cpu_model = DistributedDataParallel( - TestDdpCommHook().cpu(), process_group=process_group + ModuleForDdpCommHook().cpu(), process_group=process_group ) # Register DDP Communication Hook @@ -3123,7 +3123,7 @@ def test_ddp_comm_hook_future_passing_cpu(self): def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None): device_id = gpus_for_rank(self.world_size)[self.rank][0] gpu_model = DistributedDataParallel( - TestDdpCommHook().to(device_id), + ModuleForDdpCommHook().to(device_id), device_ids=[device_id], process_group=process_group, ) @@ -3259,7 +3259,7 @@ def test_ddp_invalid_comm_hook_init(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - model = DistributedDataParallel(TestDdpCommHook(), process_group=process_group) + model = DistributedDataParallel(ModuleForDdpCommHook(), process_group=process_group) with self.assertRaisesRegex(TypeError, "Communication hook must be callable."): model._register_comm_hook(state=None, hook=1) @@ -3283,7 +3283,7 @@ def test_ddp_invalid_comm_hook_return_type(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - model = DistributedDataParallel(TestDdpCommHook(), process_group=process_group) + model = DistributedDataParallel(ModuleForDdpCommHook(), process_group=process_group) with self.assertRaisesRegex( ValueError, @@ -3320,7 +3320,7 @@ def test_ddp_comm_hook_register_just_once(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - model = DistributedDataParallel(TestDdpCommHook(), process_group=process_group) + model = DistributedDataParallel(ModuleForDdpCommHook(), process_group=process_group) def dummy_hook(state, bucket): fut = torch.futures.Future() From 5195d727b57c19f1d5e201338a062f4d1d0636c1 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Thu, 24 Sep 2020 09:14:00 -0700 Subject: [PATCH 090/449] adding a test for ddp save()/load() (#44906) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44906 Test Plan: Imported from OSS Reviewed By: mrshenli Differential Revision: D23825386 Pulled By: bdhirsh fbshipit-source-id: 2276e6e030ef9cffd78fc78c2ffe34d60a1e160e --- test/distributed/test_c10d.py | 86 +++++++++++++++++++++++++++++++++++ 1 file changed, 86 
insertions(+) diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index d9faee9197a0..64e255fce3e6 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -2916,6 +2916,92 @@ def forward(self, x): loss = criterion(output, target) loss.backward() + @requires_nccl() + @skip_if_not_multigpu + def test_save_load_checkpoint(self): + dist.init_process_group( + "gloo", + init_method=f"file://{self.file_name}", + world_size=self.world_size, + rank=self.rank + ) + + class TestModel(nn.Module): + def __init__(self): + super(TestModel, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + def train_loop(model, optimizer, iterations): + for _ in range(iterations): + optimizer.zero_grad() + output = model(input) + loss = criterion(output, target) + loss.backward() + optimizer.step() + + device_id = gpus_for_rank(self.world_size)[self.rank][0] + + model_withload = TestModel().float().to(device_id) + model_withoutload = TestModel().float().to(device_id) + + ddp_withload = DistributedDataParallel( + model_withload, + device_ids=[device_id], + ) + ddp_withoutload = DistributedDataParallel( + model_withoutload, + device_ids=[device_id], + ) + + # ensure that both models start with the same set of parameters. By default they are randomized on construction + for p in ddp_withload.parameters(): + with torch.no_grad(): + p.zero_() + for p in ddp_withoutload.parameters(): + with torch.no_grad(): + p.zero_() + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + + optimizer_withload = torch.optim.SGD(ddp_withload.parameters(), lr=0.001) + optimizer_withoutload = torch.optim.SGD(ddp_withoutload.parameters(), lr=0.001) + + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(device_id) + + # run the model for 6 iterations, with a checkpoint in the middle + train_loop(ddp_withload, optimizer_withload, 3) + + # zero out parameters and reload them from the state dict + checkpoint_path = tempfile.gettempdir() + "/model.checkpoint" + if self.rank == 0: + torch.save(ddp_withload.state_dict(), checkpoint_path) + + dist.barrier() + for p in ddp_withload.parameters(): + with torch.no_grad(): + p.zero_() + map_location = {'cuda:%d' % 0: 'cuda:%d' % self.rank} + ddp_withload.load_state_dict( + torch.load(checkpoint_path, map_location=map_location)) + + train_loop(ddp_withload, optimizer_withload, 3) + + # re-run the model with the same inputs for 6 iterations with no checkpoint + train_loop(ddp_withoutload, optimizer_withoutload, 6) + + for p_withload, p_withoutload in zip(ddp_withload.parameters(), ddp_withoutload.parameters()): + self.assertEqual(p_withload, p_withoutload) + + def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): mult = 2 batch_size = mult * self.world_size From bfdf4323ac3bbd3dcdf4b7ab216347770ebaa3bb Mon Sep 17 00:00:00 2001 From: Shen Li Date: Thu, 24 Sep 2020 09:31:46 -0700 Subject: [PATCH 091/449] Bump up NCCL to 2.7.8 (#45251) Summary: Use latest NCCL Pull Request resolved: https://github.com/pytorch/pytorch/pull/45251 Reviewed By: mingzhe09088 Differential Revision: D23893064 Pulled By: mrshenli fbshipit-source-id: 820dd166039e61a5aa59b4c5bbc615a7b18be8c3 --- third_party/nccl/nccl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/nccl/nccl 
b/third_party/nccl/nccl index 195232556936..033d799524fb 160000 --- a/third_party/nccl/nccl +++ b/third_party/nccl/nccl @@ -1 +1 @@ -Subproject commit 195232556936b39b01cc908296e1650b80d4a3e9 +Subproject commit 033d799524fb97629af5ac2f609de367472b2696 From 8507ea22b21842f93a7d17ddfe737f134642375c Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Thu, 24 Sep 2020 09:36:46 -0700 Subject: [PATCH 092/449] replace timer test with a mocked variant (#45173) Summary: I noticed that the recently introduced adaptive_autorange tests occasionally timeout CI, and I've been meaning to improve the Timer tests for a while. This PR allows unit tests to swap the measurement portion of `Timer` with a deterministic mock so we can thoroughly test behavior without having to worry about flaky CI measurements. It also means that the tests can be much more detailed and still finish very quickly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45173 Test Plan: You're lookin' at it. Reviewed By: ezyang Differential Revision: D23873548 Pulled By: robieta fbshipit-source-id: 26113e5cea0cbf46909b9bf5e90c878c29e87e88 --- test/test_utils.py | 152 +++++++++++++++++++++++--- torch/utils/_benchmark/utils/timer.py | 4 +- 2 files changed, 141 insertions(+), 15 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index bf002541bebf..398a10971d0d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,6 +4,7 @@ import shutil import random import tempfile +import textwrap import unittest import torch import torch.nn as nn @@ -16,6 +17,7 @@ from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE, IS_WINDOWS from urllib.error import URLError +import numpy as np # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -621,25 +623,147 @@ def test_timer(self): timer = benchmark_utils.Timer( stmt="torch.ones(())", ) - median = timer.blocked_autorange(min_run_time=0.1).median + median = timer.blocked_autorange(min_run_time=0.01).median self.assertIsInstance(median, float) + # We set a very high threshold to avoid flakiness in CI. + # The internal algorithm is tested in `test_adaptive_timer` + median = timer.adaptive_autorange(threshold=0.5).median + + class _MockTimer: + _seed = 0 + + _timer_noise_level = 0.05 + _timer_cost = 100e-9 # 100 ns + + _function_noise_level = 0.05 + _function_costs = ( + ("pass", 8e-9), + ("cheap_fn()", 4e-6), + ("expensive_fn()", 20e-6), + ) + + def __init__(self, stmt, setup, timer, globals): + self._random_state = np.random.RandomState(seed=self._seed) + self._mean_cost = {k: v for k, v in self._function_costs}[stmt] + + def sample(self, mean, noise_level): + return max(self._random_state.normal(mean, mean * noise_level), 5e-9) + + def timeit(self, number): + return sum([ + # First timer invocation + self.sample(self._timer_cost, self._timer_noise_level), + + # Stmt body + self.sample(self._mean_cost * number, self._function_noise_level), + + # Second timer invocation + self.sample(self._timer_cost, self._timer_noise_level), + ]) + def test_adaptive_timer(self): - # Validate both on different sizes validate against blocked_autorange - # This looks for relative differences btetween orders of magnitude to - # provide a stable/portable test which is somewhat informative. 
- timer = benchmark_utils.Timer( - stmt="torch.sum(torch.ones((10,10)))", + class MockTimer(benchmark_utils.Timer): + _timer_cls = self._MockTimer + + def assert_reprs_match(measurement, expected): + measurement_repr = re.sub( + "object at 0x[0-9a-fA-F]+>", + "object at 0xXXXXXXXXXXXX>", + repr(measurement) + ) + self.assertEqual(measurement_repr, textwrap.dedent(expected).strip()) + + assert_reprs_match( + MockTimer("pass").blocked_autorange(min_run_time=10), + """ + + pass + Median: 7.98 ns + IQR: 0.52 ns (7.74 to 8.26) + 125 measurements, 10000000 runs per measurement, 1 thread""" ) - small = timer.adaptive_autorange(min_run_time=0.1, max_run_time=1.0) - timer = benchmark_utils.Timer( - stmt="torch.sum(torch.ones((500,500)))", + + assert_reprs_match( + MockTimer("pass").adaptive_autorange(), + """ + + pass + Median: 7.86 ns + IQR: 0.71 ns (7.63 to 8.34) + 6 measurements, 1000000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("cheap_fn()").blocked_autorange(min_run_time=10), + """ + + cheap_fn() + Median: 3.98 us + IQR: 0.27 us (3.85 to 4.12) + 252 measurements, 10000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("cheap_fn()").adaptive_autorange(), + """ + + cheap_fn() + Median: 4.16 us + IQR: 0.22 us (4.04 to 4.26) + 4 measurements, 1000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("expensive_fn()").blocked_autorange(min_run_time=10), + """ + + expensive_fn() + Median: 19.97 us + IQR: 1.35 us (19.31 to 20.65) + 501 measurements, 1000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("expensive_fn()").adaptive_autorange(), + """ + + expensive_fn() + Median: 20.79 us + IQR: 1.09 us (20.20 to 21.29) + 4 measurements, 1000 runs per measurement, 1 thread""" ) - medium = timer.adaptive_autorange(min_run_time=0.1, max_run_time=1.0) - blocked_medium = timer.blocked_autorange(min_run_time=0.1) - self.assertLess(small.median, medium.median) - # This acts as a control to compare to a different way to measure the same value. - self.assertLess(small.median, blocked_medium.median) + + class _MockCudaTimer(self._MockTimer): + # torch.cuda.synchronize is much more expensive than + # just timeit.default_timer + _timer_cost = 10e-6 + + _function_costs = ( + self._MockTimer._function_costs[0], + self._MockTimer._function_costs[1], + + # GPU should be faster once there is enough work. 
+ ("expensive_fn()", 5e-6), + ) + + class MockCudaTimer(benchmark_utils.Timer): + _timer_cls = _MockCudaTimer + + configurations = ( + (7.9903966e-09, 376, 1000000, MockTimer("pass")), + (7.8554826e-09, 4, 100000000, MockCudaTimer("pass")), + (3.9930536e-06, 752, 1000, MockTimer("cheap_fn()")), + (3.9441239e-06, 8, 100000, MockCudaTimer("cheap_fn()")), + (1.9994249e-05, 150, 1000, MockTimer("expensive_fn()")), + (4.9301076e-06, 6, 100000, MockCudaTimer("expensive_fn()")), + ) + + for median, repeats, number_per_run, timer_instance in configurations: + measurement = timer_instance.blocked_autorange(min_run_time=3) + self.assertEqual(measurement.median, median) + self.assertEqual(len(measurement.times), repeats) + self.assertEqual(measurement.number_per_run, number_per_run) def test_compare(self): compare = benchmark_utils.Compare([ diff --git a/torch/utils/_benchmark/utils/timer.py b/torch/utils/_benchmark/utils/timer.py index 00260b49f99f..c78db2740c2f 100644 --- a/torch/utils/_benchmark/utils/timer.py +++ b/torch/utils/_benchmark/utils/timer.py @@ -20,6 +20,8 @@ def timer(): class Timer(object): + _timer_cls = timeit.Timer + def __init__( self, stmt="pass", @@ -47,7 +49,7 @@ def __init__( self._description = description self._env = env self._num_threads = num_threads - self._timer = timeit.Timer(stmt=stmt, setup=setup, timer=timer, globals=globals) + self._timer = self._timer_cls(stmt=stmt, setup=setup, timer=timer, globals=globals) def _construct_measurement(self, number_per_run: int, times: List[float]): return common.Measurement( From 2b38c09f69ace58058ace6d1b3b45725c1281fca Mon Sep 17 00:00:00 2001 From: Raziel Alvarez Guevara Date: Thu, 24 Sep 2020 09:36:53 -0700 Subject: [PATCH 093/449] Moves prim ops from C10 back to JIT (#45144) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45144 Moves prim ops from C10 back to JIT. 
These were originally moved to C10 from JIT in D19237648 (https://github.com/pytorch/pytorch/commit/f362cd510dcedbf7384d418aad60e0ba963baeb6) ghstack-source-id: 112775781 Test Plan: buck test //caffe2/test/cpp/jit:jit https://pxl.cl/1l22N buck test adsatlas/gavel/lib/ata_processor/tests:ata_processor_test https://pxl.cl/1lBxD Reviewed By: iseeyuan Differential Revision: D23697598 fbshipit-source-id: 36d1eb8c346e9b161ba6af537a218440a9bafd27 --- aten/src/ATen/templates/TypeDefault.cpp | 8 --- test/cpp/jit/test_lite_interpreter.cpp | 26 +++++++++ test/cpp/jit/tests.h | 1 + tools/build_variables.bzl | 8 +-- torch/csrc/jit/runtime/register_prim_ops.cpp | 53 +++++++++++++++++++ .../jit/runtime/register_prim_ops_c10.cpp | 40 -------------- 6 files changed, 82 insertions(+), 54 deletions(-) delete mode 100644 torch/csrc/jit/runtime/register_prim_ops_c10.cpp diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index c1e7c9ac0c64..58c80381d340 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -59,14 +59,6 @@ TORCH_LIBRARY(aten, m) { m.def(TORCH_SELECTIVE_SCHEMA("aten::rsplit(str self, str separator=' ', int max=-1) -> str[]")); m.def(TORCH_SELECTIVE_SCHEMA("aten::join(str self, str[] values) -> str")); - // Integer Ops - // Implementations located in torch/csrc/jit/runtime/register_prim_ops_c10.cp - m.def("Int.Tensor(Tensor a) -> int"); - m.def("Int.bool(bool a) -> int"); - m.def("Int.float(float a) -> int"); - m.def("Int.Scalar(Scalar a) -> int"); - m.def("Int.str(str a) -> int"); - // Distributed Ops // Implementations located in torch/csrc/jit/runtime/register_distributed_ops.cpp m.def("get_gradients(int context_id) -> Dict(Tensor, Tensor)"); diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index d09048413aec..814654dfc697 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -204,6 +204,32 @@ void testLiteInterpreterPrim() { AT_ASSERT(resi == refi); } +void testLiteInterpreterPrimScalar() { + Module m("m"); + m.define(R"JIT( + def forward(self, x): + return int(x.item()) + )JIT"); + + std::vector inputs; + auto minput = 3.5 * torch::ones({}); + inputs.emplace_back(minput); + auto ref = m.run_method("forward", minput); + + std::stringstream ss; + m._save_for_mobile(ss); + mobile::Module bc = _load_for_mobile(ss); + IValue res; + for (int i = 0; i < 3; ++i) { + auto bcinputs = inputs; + res = bc.get_method("forward")(bcinputs); + } + + auto resi = res.toInt(); + auto refi = ref.toInt(); + AT_ASSERT(resi == refi); +} + void testLiteInterpreterLoadOrigJit() { Module m("m"); m.register_parameter("foo", torch::ones({}), false); diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 186aaaec2bba..0285559fb8fc 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -71,6 +71,7 @@ namespace jit { _(MobileTypeParser) \ _(LiteInterpreterBuiltinFunction) \ _(LiteInterpreterPrim) \ + _(LiteInterpreterPrimScalar) \ _(LiteInterpreterLoadOrigJit) \ _(LiteInterpreterWrongMethodName) \ _(LiteInterpreterParams) \ diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index b1a2967f5dea..174bb858da44 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -91,11 +91,7 @@ core_sources_common = [ "torch/csrc/jit/serialization/unpickler.cpp", ] -jit_sources_common = [ - "torch/csrc/jit/runtime/register_prim_ops_c10.cpp", -] - -libtorch_sources_common = core_sources_common + jit_sources_common 
+libtorch_sources_common = core_sources_common core_trainer_sources = [ "torch/csrc/autograd/anomaly_mode.cpp", @@ -306,7 +302,7 @@ jit_sources_full = [ "torch/csrc/jit/passes/utils/check_alias_annotation.cpp", ] -libtorch_core_jit_sources = sorted(jit_sources_common + jit_sources_full) +libtorch_core_jit_sources = sorted(jit_sources_full) libtorch_cmake_sources = libtorch_core_sources + libtorch_core_jit_sources diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index 98f328a43240..bf2ffa421ee9 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -188,6 +188,59 @@ RegisterOperators reg( push(stack, (bool)d); }, aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA("aten::Int.Tensor(Tensor a) -> int"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.item()); + }, + aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA("aten::Int.bool(bool a) -> int"), + [](Stack* stack) { + bool b; + pop(stack, b); + push(stack, static_cast(b)); + }, + aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA("aten::Int.float(float a) -> int"), + [](Stack* stack) { + double d; + pop(stack, d); + push(stack, static_cast(d)); + }, + aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA("aten::Int.Scalar(Scalar a) -> int"), + [](Stack* stack) { + IValue scalar; + pop(stack, scalar); + if (scalar.isInt()) { + push(stack, std::move(scalar)); + } else { + // toScalar() needed to avoid strict type check in IValue::toInt. + push(stack, static_cast(scalar.toScalar().toInt())); + } + }, + aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA("aten::Int.str(str a) -> int"), + [](Stack* stack) { + auto s = pop(stack).toString(); + std::string::size_type sz; + int64_t val = static_cast(c10::stoll(s->string(), &sz)); + if (sz == s->string().size()) { + push(stack, val); + } else { + std::stringstream error_str; + error_str << "invalid literal for int() " + << "with base 10: '" << s->string() << "'"; + throw std::runtime_error(error_str.str()); + } + }, + aliasAnalysisFromSchema()), OperatorGenerator( TORCH_SELECTIVE_SCHEMA("aten::Float.Tensor(Tensor a) -> float"), [](Stack* stack) { diff --git a/torch/csrc/jit/runtime/register_prim_ops_c10.cpp b/torch/csrc/jit/runtime/register_prim_ops_c10.cpp deleted file mode 100644 index b9e4e23c77b0..000000000000 --- a/torch/csrc/jit/runtime/register_prim_ops_c10.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include -#include -#include -#include -#include - -using Stack = std::vector; -using at::Scalar; -using at::Tensor; -using c10::IValue; -using torch::jit::drop; -using torch::jit::pack; -using torch::jit::peek; -using torch::jit::pop; -using torch::jit::push; - -// Implementations located in torch/csrc/jit/runtime/register_prim_ops_c10.cpp -TORCH_LIBRARY_IMPL(aten, CatchAll, m) { - m.impl("Int.Tensor", [](at::Tensor a) { return a.item(); }); - - m.impl("Int.bool", [](bool b) { return static_cast(b); }); - - m.impl("Int.float", [](double d) { return static_cast(d); }); - - m.impl("Int.Scalar", [](Scalar scalar) { - return static_cast(scalar.toInt()); - }); - - m.impl("Int.str", [](const std::string& str) { - std::string::size_type sz; - int64_t val = static_cast(c10::stoll(str, &sz)); - if (sz != str.size()) { - std::stringstream error_str; - error_str << "invalid literal for int() " - << "with base 10: '" << str << "'"; - throw 
std::runtime_error(error_str.str()); - } - return val; - }); -} From e57a08119bc01b7d06ee6ba8042cc0885ebb6276 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Thu, 24 Sep 2020 09:48:51 -0700 Subject: [PATCH 094/449] Add a warning log when there is high skew of uneven inputs in DDP training (#45238) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45238 Adds a warning when there is much higher than expected amount of discrepancy of inputs across different processes when running with uneven inputs. This is because a skew in the thousands can reduce performance a nontrivial amount as shown in benchmarks, and it was proposed to add this warning as a result. Tested by running the tests so the threshold is hit and observing the output. ghstack-source-id: 112773552 Test Plan: CI Reviewed By: mrshenli Differential Revision: D23719270 fbshipit-source-id: 306264f62c1de65e733696a912bdb6e9376d5622 --- torch/nn/parallel/distributed.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 44f5e6fe2ccb..790a9d1c2fc4 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -4,6 +4,7 @@ import os import inspect import logging +import warnings import torch @@ -431,7 +432,6 @@ def model_parameters(m): if self.device_ids and len(self.device_ids) > 1: - import warnings warnings.warn( "Single-Process Multi-GPU is not the recommended mode for " "DDP. In this mode, each DDP instance operates on multiple " @@ -815,8 +815,20 @@ def join(self, divide_by_initial_world_size=True, enable=True): if enable and not has_error: all_procs_joined = False is_last_joiner = True - # Schedules allreduce to match fwd pass allreduce in non-joined procs + i = 0 + WARN_THRESHOLD = 1000 + warnings.simplefilter("once") while not all_procs_joined: + if i > WARN_THRESHOLD: + my_rank = dist.get_rank(self.process_group) + warnings.warn( + "Detected uneven input skew of greater " + f"than {WARN_THRESHOLD}. This means that rank {my_rank} " + f"has at least {WARN_THRESHOLD} fewer inputs than " + "other currently active ranks. This level of skew could " + "lead to performance degradation during training." + ) + # Schedules allreduce to match fwd pass allreduce in non-joined procs num_active_procs = self._schedule_shadow_all_reduce_for_fwd_pass() if num_active_procs == 0: all_procs_joined = True @@ -853,6 +865,7 @@ def join(self, divide_by_initial_world_size=True, enable=True): self._match_unused_params_allreduce() # It will push rebuilt params only once during training period self.reducer._push_all_rebuilt_params() + i += 1 # All procs joined. Agree on authoritative rank and broadcast the model. 
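A usage sketch of the join() context this hunk touches (names such as ddp, loader and opt are illustrative; this is not code from the patch): ranks that exhaust their data keep shadowing the collectives of still-active ranks, and with this change a rank that idles for more than WARN_THRESHOLD (1000) iterations emits the new skew warning.

    # Assumes a DistributedDataParallel instance `ddp`, an optimizer `opt`, and a
    # per-rank iterable `loader` whose length may differ across ranks.
    with ddp.join():
        for batch in loader:
            opt.zero_grad()
            loss = ddp(batch).sum()
            loss.backward()
            opt.step()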
self._sync_final_model(is_last_joiner) From b8eab8cdbdc467bc6ef19381af9387f41e45fb44 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Thu, 24 Sep 2020 10:04:47 -0700 Subject: [PATCH 095/449] [hotfix] typo in NaiveConvolutionTranspose2d.cu (#45224) Summary: Fixes typo in e2f49c8 Fixes https://github.com/pytorch/pytorch/issues/45172 Pull Request resolved: https://github.com/pytorch/pytorch/pull/45224 Reviewed By: ezyang Differential Revision: D23879872 Pulled By: walterddr fbshipit-source-id: c3db6d4c6f2ac0e6887862d4217a79c030647cb9 --- .../cuda/NaiveConvolutionTranspose2d.cu | 5 +- test/test_nn.py | 57 +++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu index 10138f4bced0..13149759926d 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu @@ -684,10 +684,7 @@ void slow_conv_transpose2d_acc_grad_parameters_cuda_template( // Matrix mulitply per output: input_n = input.select(0, elt); - if (kernel_height == 1 && kernel_width == 1) { - // for 1x1 column skip im2col step - columns.copy_(grad_output_n); - } else { + if (kernel_height != 1 || kernel_width != 1) { // Extract columns: im2col( at::cuda::getCurrentCUDAStream(), diff --git a/test/test_nn.py b/test/test_nn.py index 281425e26782..9618b70ab71c 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -10698,6 +10698,63 @@ def test_contig_wrong_stride_cudnn(self, device): F.conv_transpose2d(x, torch.randn(16, 1, 1, 1, device=device)) F.conv2d(x, torch.randn(1, 16, 1, 1, device=device)) + @onlyCUDA + def test_Conv2d_size_1_kernel(self, device): + x_cpu = torch.randn(2, 3, 5, 5) + conv_cpu = torch.nn.Conv2d(3, 3, kernel_size=1) + y_cpu = conv_cpu(x_cpu) + y = torch.rand_like(y_cpu) + y_cpu.backward(y) + + with cudnn.flags(enabled=False): + conv_cuda = torch.nn.Conv2d(3, 3, kernel_size=1).to(device) + conv_cuda.bias.data.copy_(conv_cpu.bias.data) + conv_cuda.weight.data.copy_(conv_cpu.weight.data) + y_cuda = conv_cuda(x_cpu.to(device)) + y_cuda.backward(y.to(device)) + + self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False) + + @onlyCUDA + def test_ConvTranspose2d_size_1_kernel(self, device): + x_cpu = torch.randn(2, 3, 5, 5) + conv_cpu = torch.nn.ConvTranspose2d(3, 3, kernel_size=1) + y_cpu = conv_cpu(x_cpu) + y = torch.rand_like(y_cpu) + y_cpu.backward(y) + + with cudnn.flags(enabled=False): + conv_cuda = torch.nn.ConvTranspose2d(3, 3, kernel_size=1).to(device) + conv_cuda.bias.data.copy_(conv_cpu.bias.data) + conv_cuda.weight.data.copy_(conv_cpu.weight.data) + y_cuda = conv_cuda(x_cpu.to(device)) + y_cuda.backward(y.to(device)) + + self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False) + + @onlyCUDA + def test_ConvTranspose3d_size_1_kernel(self, device): + x_cpu = torch.randn(2, 3, 3, 5, 5) + conv_cpu = torch.nn.ConvTranspose3d(3, 3, kernel_size=1) + y_cpu = conv_cpu(x_cpu) + y = torch.rand_like(y_cpu) + y_cpu.backward(y) + + with 
cudnn.flags(enabled=False): + conv_cuda = torch.nn.ConvTranspose3d(3, 3, kernel_size=1).to(device) + conv_cuda.bias.data.copy_(conv_cpu.bias.data) + conv_cuda.weight.data.copy_(conv_cpu.weight.data) + y_cuda = conv_cuda(x_cpu.to(device)) + y_cuda.backward(y.to(device)) + + self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False) + def _ordered_sequence(self, device, dtype): """Create ordered list of random sequences""" seqs = [torch.empty(random.randint(1, 6), device=device, dtype=dtype) From 3f5eee666cfbb5cbc5ced32915658e00b39b40e9 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Thu, 24 Sep 2020 10:23:46 -0700 Subject: [PATCH 096/449] Adjust TF32 tests (#44240) Summary: - The thresholds of some tests are bumped up. Depending on the random generator, sometimes these tests fail with things like 0.0059 is not smaller than 0.005. I ran `test_nn.py` and `test_torch.py` for 10+ times to check these are no longer flaky. - Add `tf32_on_and_off` to new `matrix_exp` tests. - Disable TF32 on test suites other than `test_nn.py` and `test_torch.py` cc: ptrblck Pull Request resolved: https://github.com/pytorch/pytorch/pull/44240 Reviewed By: mruberry Differential Revision: D23882498 Pulled By: ngimel fbshipit-source-id: 44a9ec08802c93a2efaf4e01d7487222478b6df8 --- aten/src/ATen/Context.cpp | 23 ++++++++++++++++++ aten/src/ATen/Context.h | 16 +++++++++++++ aten/src/ATen/cuda/CUDABlas.cpp | 32 +++++++++++++++---------- aten/src/ATen/cuda/CublasHandlePool.cpp | 2 +- aten/src/ATen/native/LinearAlgebra.cpp | 3 +++ test/jit/test_tracer.py | 4 ++++ test/test_jit_fuser.py | 7 ++++++ test/test_nn.py | 1 + test/test_torch.py | 28 +++++++++++++--------- torch/testing/_internal/common_cuda.py | 15 ++++++++++++ torch/testing/_internal/common_nn.py | 21 +++++++++++++--- 11 files changed, 124 insertions(+), 28 deletions(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 1496b6ee551d..18673877c219 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -230,4 +230,27 @@ Allocator* getCPUAllocator() { return getTHDefaultAllocator(); } +// override_allow_tf32_flag = true +// means the allow_tf32 flags are overrided and tf32 is force disabled +// override_allow_tf32_flag = false +// means the original allow_tf32 flags are followed +thread_local bool override_allow_tf32_flag = false; + +NoTF32Guard::NoTF32Guard() { + if (!override_allow_tf32_flag) { + changed = true; + override_allow_tf32_flag = true; + } +} + +NoTF32Guard::~NoTF32Guard() { + if (changed) { + override_allow_tf32_flag = false; + } +} + +bool NoTF32Guard::should_disable_tf32() { + return override_allow_tf32_flag; +} + } // namespace at diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index b8782209def5..fed5e88e5314 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -327,4 +327,20 @@ static inline void manual_seed(uint64_t seed) { } } +// When the global flag `allow_tf32` is set to true, cuBLAS handles are +// automatically configured to use math mode CUBLAS_TF32_TENSOR_OP_MATH. +// For some operators, such as addmv, TF32 offers no performance improvement +// but causes precision loss. To help this case, this class implements +// a RAII guard that can be used to quickly disable TF32 within its scope. 
+// +// Usage: +// NoTF32Guard disable_tf32; +struct TORCH_API NoTF32Guard { + NoTF32Guard(); + ~NoTF32Guard(); + static bool should_disable_tf32(); +private: + bool changed = false; +}; + } // namespace at diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 0311399649e7..d4b7155b0591 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -407,19 +407,22 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { #endif #if !defined(__HIP_PLATFORM_HCC__) || (defined(__HIP_PLATFORM_HCC__) && HIP_VERSION >= 210) - template <> - void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); - cublasOperation_t op = _cublasOpFromChar(trans); - _cublasAdjustLdLevel2(m, n, &lda); - GEMV_CHECK_ARGVALUES(c10::complex); - TORCH_CUDABLAS_CHECK( - cublasCgemv(handle, op, m, n, reinterpret_cast(&alpha), reinterpret_cast(a), - lda, reinterpret_cast(x), incx, reinterpret_cast(&beta), - reinterpret_cast(y), incy)); - } +template <> +void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { + // gemv is bw bound, and does not benefit from TF32. But the precision + // loss still happens on TF32. So we disable it here. + NoTF32Guard disable_tf32; + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + cublasOperation_t op = _cublasOpFromChar(trans); + _cublasAdjustLdLevel2(m, n, &lda); + GEMV_CHECK_ARGVALUES(c10::complex); + TORCH_CUDABLAS_CHECK( + cublasCgemv(handle, op, m, n, reinterpret_cast(&alpha), reinterpret_cast(a), + lda, reinterpret_cast(x), incx, reinterpret_cast(&beta), + reinterpret_cast(y), incy)); +} #endif template <> @@ -436,6 +439,9 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(double)) { template <> void gemv(CUDABLAS_GEMV_ARGTYPES(float)) { + // gemv is bw bound, and does not benefit from TF32. But the precision + // loss still happens on TF32. So we disable it here. + NoTF32Guard disable_tf32; // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 404f322545f8..0165c53ac60d 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -45,7 +45,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH. 
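// -------------------------------------------------------------------------
// Editor's note (illustrative only, not part of this patch): how the pieces
// above fit together. An operator that should not run in TF32 constructs the
// RAII guard; while the guard is alive the thread-local override is set, and
// the handle-selection check changed below keeps cuBLAS in
// CUBLAS_DEFAULT_MATH. The operator name here is hypothetical.
//
//     Tensor my_precision_sensitive_op(const Tensor& a) {
//       at::NoTF32Guard disable_tf32;   // TF32 stays off until scope exit
//       return at::matmul(a, a);        // dispatches through this handle
//     }
//
// Nesting is safe: only the outermost guard records `changed`, so the flag
// is restored exactly once, when that guard is destroyed.
// -------------------------------------------------------------------------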
- if (at::globalContext().allowTF32CuBLAS()) { + if (!NoTF32Guard::should_disable_tf32() && at::globalContext().allowTF32CuBLAS()) { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH)); } else { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index e93eb11f642c..a8bb81b3e222 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1223,6 +1223,8 @@ Tensor matrix_exp(const Tensor& a) { "matrix_exp(", a.scalar_type(), "{", a.sizes(), "}): expected a tensor " "of squared matrices"); + NoTF32Guard disable_tf32; + if (a.size(-1) == 1) { return a.exp(); } @@ -1231,6 +1233,7 @@ Tensor matrix_exp(const Tensor& a) { } Tensor matrix_exp_backward(const Tensor& self, const Tensor& grad) { + NoTF32Guard disable_tf32; return backward_analytic_function_of_a_matrix( self, grad, [](const Tensor& a) { diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 518af2f95a4c..24db4cfe857e 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -18,6 +18,7 @@ IS_SANDCASTLE, IS_WINDOWS from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, \ _tmp_donotuse_dont_inline_everything, _trace, RUN_CUDA, RUN_CUDA_MULTI_GPU +from torch.testing._internal.common_cuda import with_tf32_off from typing import List, Tuple from torch import Tensor @@ -900,6 +901,9 @@ def foo(a): self.assertEqual(foo(x), x + x + x) @unittest.skipIf(not RUN_CUDA, "calls .cuda()") + # By default, on Ampere or later GPUs, nn.Linear computes float tensors at TF32 precision. + # We want float tensors to be computed at full precision in order to use the default precision + @with_tf32_off def test_traced_module_cuda(self): class Model(nn.Module): def __init__(self, num_features, num_layers): diff --git a/test/test_jit_fuser.py b/test/test_jit_fuser.py index a75da03a6d21..b4efbf12c358 100644 --- a/test/test_jit_fuser.py +++ b/test/test_jit_fuser.py @@ -10,6 +10,7 @@ RUN_CUDA, RUN_CUDA_HALF, RUN_CUDA_MULTI_GPU, warmup_backward from textwrap import dedent from itertools import product, permutations +from torch.testing._internal.common_cuda import with_tf32_off from test_jit import backward_graph, all_backward_graphs, get_lstm_inputs, get_milstm_inputs, \ LSTMCellC, LSTMCellF, LSTMCellS, MiLSTMCell @@ -710,6 +711,9 @@ def test_lstm_cuda(self): "aten::_grad_sum_to_size")) @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + # By default, on Ampere or later GPUs, LSTM computes float tensors at TF32 precision. + # We want float tensors to be computed at full precision in order to use the default precision + @with_tf32_off def test_lstm_concat_cuda(self): inputs = get_lstm_inputs('cuda') ge = self.checkTrace(LSTMCellC, inputs) @@ -740,6 +744,9 @@ def cell(x, hx, cx, w_ih, w_hh, b_ih, b_hh): # TODO: Fuser doesn't work at all when inputs require grad. Fix that @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + # By default, on Ampere or later GPUs, LSTM computes float tensors at TF32 precision. 
+ # We want float tensors to be computed at full precision in order to use the default precision + @with_tf32_off def test_lstm_traced_cuda(self): inputs = get_lstm_inputs('cuda') ge = self.checkTrace(LSTMCellF, inputs) diff --git a/test/test_nn.py b/test/test_nn.py index 9618b70ab71c..8b9bf9156106 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -12018,6 +12018,7 @@ def _test_conv_cudnn_nhwc_nchw(self, layer, n, c, h, w, k, filter_size, device): @onlyCUDA @skipCUDAIfRocm @skipCUDAIfCudnnVersionLessThan(7603) + @tf32_on_and_off(0.05) def test_conv_cudnn_mismatch_memory_format(self, device): configs = [ [4, 2, 8, 8, 4, 2], diff --git a/test/test_torch.py b/test/test_torch.py index ee27c8dd65cf..8c355eb93570 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -41,11 +41,10 @@ from typing import Dict, List, Tuple, Union import torch.backends.quantized import torch.testing._internal.data -from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, \ +from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, with_tf32_off, \ _get_torch_cuda_version, TEST_MAGMA - # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings load_tests = load_tests @@ -7003,6 +7002,9 @@ def test_matrix_exp_boundary_cases(self, device, dtype): @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.float, torch.double) + # Although tf32 is always disabled on matrix_exp, this test uses matmul, + # which has tf32 on by default + @with_tf32_off def test_matrix_exp_analytic(self, device, dtype): # check zero matrix x = torch.zeros(20, 20, dtype=dtype, device=device) @@ -7144,6 +7146,9 @@ def run_test(*n): @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.float, torch.double) + # Although tf32 is always disabled on matrix_exp, this test uses matmul, + # which has tf32 on by default + @with_tf32_off def test_matrix_exp_compare_with_taylor(self, device, dtype): def normalize_to_1_operator_norm(sample, desired_norm): @@ -16471,6 +16476,7 @@ def _test(row_major, incx, incy, lda_tail): @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) @dtypes(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes()) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + @tf32_on_and_off(0.05) def test_addmm(self, device, dtype): M = torch.randn(10, 25, device=device).to(dtype) m1 = torch.randn(10, 50, device=device).to(dtype) @@ -19832,13 +19838,13 @@ def inner(self, device, dtype): 1e-1, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes()), ('addbmm', '', _small_2d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, - _cpu_types, True, [tf32_on_and_off(0.005)]), + _cpu_types, True, [tf32_on_and_off(0.01)]), ('addbmm', 'scalar', _small_2d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addbmm_? 
is deprecated")]), ('addbmm', 'two_scalars', _small_2d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), ('baddbmm', '', _small_3d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)), ('baddbmm', 'scalar', _small_3d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], @@ -19865,26 +19871,26 @@ def inner(self, device, dtype): [_wrap_maybe_warns("This overload of addcmul_? is deprecated")]), ('addmm', '', _medium_2d, lambda t, d: [_medium_2d(t, d), _medium_2d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), - _cpu_types, True, [tf32_on_and_off(0.005)], 0, True), + _cpu_types, True, [tf32_on_and_off(0.01)], 0, True), ('addmm', 'scalar', _medium_2d, lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_2d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), ('addmm', 'two_scalars', _medium_2d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_2d(t, d), _medium_2d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), ('addmv', '', _medium_1d, lambda t, d: [_medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, - True, [tf32_on_and_off(0.005)], 0, True), + True, [], 0, True), ('addmv', 'scalar', _medium_1d, lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmv_? is deprecated")]), + [_wrap_maybe_warns("This overload of addmv_? is deprecated")]), ('addmv', 'two_scalars', _medium_1d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmv_? is deprecated")]), + [_wrap_maybe_warns("This overload of addmv_? is deprecated")]), ('addr', '', _medium_2d, lambda t, d: [_medium_1d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, _float_types2), ('addr', 'scalar', _medium_2d, diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 88f6bf11976c..f0e8c40602c0 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -127,6 +127,21 @@ def wrapped(self, device, dtype): return wrapped return wrapper + +# This is a wrapper that wraps a test to run it with TF32 turned off. 
+# This wrapper is designed to be used when a test uses matmul or convolutions +# but the purpose of that test is not testing matmul or convolutions. +# Disabling TF32 will enforce torch.float tensors to be always computed +# at full precision. +def with_tf32_off(f): + @functools.wraps(f) + def wrapped(*args, **kwargs): + with tf32_off(): + return f(*args, **kwargs) + + return wrapped + + def _get_torch_cuda_version(): if torch.version.cuda is None: return [0, 0] diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 4b18ec86fa2f..2de86795cda7 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -1601,6 +1601,7 @@ def fractional_max_pool3d_test(test_case): input_size=(2, 4, 10), cudnn=True, with_tf32=True, + tf32_precision=0.005, ), dict( module_name='Conv1d', @@ -1620,6 +1621,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, desc='pad1', with_tf32=True, + tf32_precision=0.005, ), dict( module_name='Conv1d', @@ -1629,6 +1631,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, desc='pad2', with_tf32=True, + tf32_precision=0.005, ), dict( module_name='Conv1d', @@ -1638,6 +1641,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, desc='pad1size1', with_tf32=True, + tf32_precision=0.005, ), dict( module_name='Conv1d', @@ -1647,6 +1651,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, desc='pad2size1', with_tf32=True, + tf32_precision=0.005, ), dict( module_name='Conv1d', @@ -1657,6 +1662,7 @@ def fractional_max_pool3d_test(test_case): desc='zero_batch', test_cuda=(not TEST_WITH_ROCM), with_tf32=True, + tf32_precision=0.005, ), dict( fullname='Conv1d_dilated', @@ -1664,6 +1670,7 @@ def fractional_max_pool3d_test(test_case): cpp_constructor_args='torch::nn::Conv1dOptions(4, 5, 3).dilation(2)', input_size=(2, 4, 10), with_tf32=True, + tf32_precision=0.005, ), dict( fullname='Conv1d_groups', @@ -1672,6 +1679,7 @@ def fractional_max_pool3d_test(test_case): input_size=(2, 4, 6), cudnn=True, with_tf32=True, + tf32_precision=0.005, ), dict( fullname='ConvTranspose1d', @@ -1702,6 +1710,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, desc='dilated', with_tf32=True, + tf32_precision=0.005, ), dict( fullname='ConvTranspose1d_groups', @@ -2117,7 +2126,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, check_with_long_tensor=True, with_tf32=True, - tf32_precision=0.005, + tf32_precision=0.05, ), dict( module_name='Conv3d', @@ -2140,7 +2149,7 @@ def fractional_max_pool3d_test(test_case): desc='stride', check_with_long_tensor=True, with_tf32=True, - tf32_precision=0.005, + tf32_precision=0.05, ), dict( module_name='Conv3d', @@ -2151,7 +2160,7 @@ def fractional_max_pool3d_test(test_case): desc='stride_padding', check_with_long_tensor=True, with_tf32=True, - tf32_precision=0.01, + tf32_precision=0.05, ), dict( module_name='Conv3d', @@ -2180,6 +2189,7 @@ def fractional_max_pool3d_test(test_case): cpp_constructor_args='torch::nn::Conv3dOptions(3, 4, 2).dilation(2)', input_size=(2, 3, 5, 5, 5), with_tf32=True, + tf32_precision=0.05, ), dict( fullname='Conv3d_dilated_strided', @@ -2187,6 +2197,7 @@ def fractional_max_pool3d_test(test_case): cpp_constructor_args='torch::nn::Conv3dOptions(3, 4, 2).dilation(2).stride(2)', input_size=(2, 3, 5, 5, 5), with_tf32=True, + tf32_precision=0.05 ), dict( module_name='ConvTranspose3d', @@ -2195,6 +2206,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, input_size=(1, 2, 4, 5, 4), with_tf32=True, + tf32_precision=0.05 ), dict( 
module_name='ConvTranspose3d', @@ -2205,6 +2217,7 @@ def fractional_max_pool3d_test(test_case): input_size=(1, 2, 4, 5, 4), desc='dilated', with_tf32=True, + tf32_precision=0.05 ), dict( module_name='MaxPool3d', @@ -5005,6 +5018,8 @@ def __init__(self, *args, **kwargs): self.check_bfloat16 = kwargs.get('check_bfloat16', False) self.convert_target = kwargs.get('convert_target', True) self.test_cpu = kwargs.get('test_cpu', True) + self.with_tf32 = kwargs.get('with_tf32', True) + self.tf32_precision = kwargs.get('tf32_precision', 0.001) def __call__(self, test_case): module = self.constructor(*self.constructor_args) From c79d493096abdb1c87586e41e50c1aa725f28df1 Mon Sep 17 00:00:00 2001 From: Kyle Chen Date: Thu, 24 Sep 2020 11:15:46 -0700 Subject: [PATCH 097/449] added rocm 3.8 docker image (#45205) Summary: jeffdaily Pull Request resolved: https://github.com/pytorch/pytorch/pull/45205 Reviewed By: malfet Differential Revision: D23906606 Pulled By: walterddr fbshipit-source-id: 604a12bf4c97260215a1881cc96e35e7c42b4578 --- .circleci/cimodel/data/simple/docker_definitions.py | 1 + .circleci/config.yml | 3 +++ .circleci/docker/build.sh | 7 +++++++ .circleci/docker/common/install_base.sh | 2 +- 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/.circleci/cimodel/data/simple/docker_definitions.py b/.circleci/cimodel/data/simple/docker_definitions.py index 59944d190383..90d776311601 100644 --- a/.circleci/cimodel/data/simple/docker_definitions.py +++ b/.circleci/cimodel/data/simple/docker_definitions.py @@ -28,6 +28,7 @@ "pytorch-linux-xenial-py3.6-gcc7.2", "pytorch-linux-xenial-py3.6-gcc7", "pytorch-linux-bionic-rocm3.7-py3.6", + "pytorch-linux-bionic-rocm3.8-py3.6", ] diff --git a/.circleci/config.yml b/.circleci/config.yml index c952ee716b3d..b70a090bed72 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6320,6 +6320,9 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-bionic-rocm3.7-py3.6" image_name: "pytorch-linux-bionic-rocm3.7-py3.6" + - docker_build_job: + name: "docker-pytorch-linux-bionic-rocm3.8-py3.6" + image_name: "pytorch-linux-bionic-rocm3.8-py3.6" - pytorch_linux_build: name: pytorch_linux_xenial_py3_6_gcc5_4_build requires: diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 9bfa0b195499..0afc1b33c59e 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -262,6 +262,13 @@ case "$image" in VISION=yes ROCM_VERSION=3.7 ;; + pytorch-linux-bionic-rocm3.8-py3.6) + ANACONDA_PYTHON_VERSION=3.6 + PROTOBUF=yes + DB=yes + VISION=yes + ROCM_VERSION=3.8 + ;; *) # Catch-all for builds that are not hardcoded. PROTOBUF=yes diff --git a/.circleci/docker/common/install_base.sh b/.circleci/docker/common/install_base.sh index ac4e1f18f1ef..5e8173a43627 100755 --- a/.circleci/docker/common/install_base.sh +++ b/.circleci/docker/common/install_base.sh @@ -118,7 +118,7 @@ esac # Install Valgrind separately since the apt-get version is too old. mkdir valgrind_build && cd valgrind_build -VALGRIND_VERSION=3.15.0 +VALGRIND_VERSION=3.16.1 if ! 
wget http://valgrind.org/downloads/valgrind-${VALGRIND_VERSION}.tar.bz2 then wget https://sourceware.org/ftp/valgrind/valgrind-${VALGRIND_VERSION}.tar.bz2 From 26001a2334783e083e252e366ec0804c1e12d5e9 Mon Sep 17 00:00:00 2001 From: Xinyu Li Date: Thu, 24 Sep 2020 11:53:58 -0700 Subject: [PATCH 098/449] Revert D23753711: [pytorch][PR] Add foreach APIs for binary ops with ScalarList Test Plan: revert-hammer Differential Revision: D23753711 (https://github.com/pytorch/pytorch/commit/71d1b5b0e227e407e60c0a3dd6a4caabdcd6c89a) Original commit changeset: bf3e8c54bc07 fbshipit-source-id: 192692e0d3fff4cade9983db0a1760fedfc9674c --- aten/src/ATen/native/ForeachOpsKernels.cpp | 24 - aten/src/ATen/native/ForeachUtils.h | 14 - .../native/cuda/ForeachBinaryOpScalarList.cu | 60 -- aten/src/ATen/native/cuda/ForeachFunctors.cuh | 115 ---- .../src/ATen/native/cuda/MultiTensorApply.cuh | 70 --- aten/src/ATen/native/native_functions.yaml | 97 +--- .../check_backward_compatibility.py | 4 - test/test_foreach.py | 529 ++++-------------- test/test_native_functions.py | 2 +- tools/autograd/gen_python_functions.py | 1 - .../templates/python_torch_functions.cpp | 1 - tools/codegen/model.py | 4 - tools/pyi/gen_pyi.py | 1 - torch/csrc/utils/python_arg_parser.cpp | 22 +- torch/csrc/utils/python_arg_parser.h | 18 +- 15 files changed, 119 insertions(+), 843 deletions(-) delete mode 100644 aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 73eb2070c07d..912b5116c4cc 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -24,26 +24,6 @@ std::vector foreach_tensor_##NAME##_scalar_kernel_slow(TensorList tensor return result; \ } -#define FOREACH_BINARY_OP_SCALARLIST(NAME) \ -void foreach_tensor_##NAME##_scalarlist_kernel_slow_(TensorList tensors, at::ArrayRef scalars) { \ - check_foreach_api_restrictions(tensors, scalars); \ - \ - for (int i = 0; i < tensors.size(); i++) { \ - tensors[i].NAME##_(scalars[i]); \ - } \ -} \ - \ -std::vector foreach_tensor_##NAME##_scalarlist_kernel_slow(TensorList tensors, at::ArrayRef scalars) { \ - check_foreach_api_restrictions(tensors, scalars); \ - std::vector result; \ - result.reserve(tensors.size()); \ - for (int i = 0; i < tensors.size(); i++) { \ - result.emplace_back(tensors[i].NAME(scalars[i])); \ - } \ - \ - return result; \ -} - #define FOREACH_BINARY_OP_LIST(NAME) \ std::vector foreach_tensor_##NAME##_list_kernel_slow(TensorList tensors1, TensorList tensors2) { \ check_foreach_api_restrictions(tensors1, tensors2); \ @@ -137,10 +117,6 @@ FOREACH_BINARY_OP_SCALAR(add); FOREACH_BINARY_OP_SCALAR(sub); FOREACH_BINARY_OP_SCALAR(mul); FOREACH_BINARY_OP_SCALAR(div); -FOREACH_BINARY_OP_SCALARLIST(add); -FOREACH_BINARY_OP_SCALARLIST(sub); -FOREACH_BINARY_OP_SCALARLIST(mul); -FOREACH_BINARY_OP_SCALARLIST(div); FOREACH_BINARY_OP_LIST(mul); FOREACH_BINARY_OP_LIST(div); FOREACH_UNARY_OP(sqrt); diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 44e6a50297db..5a7aced74702 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -31,12 +31,6 @@ void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2) { } } -void check_foreach_api_restrictions(TensorList tensors, ArrayRef scalars) { - TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor."); - TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one 
value."); - TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list."); -} - // To go via 'fast' path, several conditions must be satisfied // - All tensors must be on the same device // - All tensors must have strided layout @@ -138,13 +132,5 @@ bool can_use_fast_route(TensorList tensors) { return true; } -bool can_use_fast_route(TensorList tensors, ArrayRef scalars) { - TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor."); - TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value."); - TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list."); - - return can_use_fast_route(tensors); -} - } }} // at::native diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu deleted file mode 100644 index 684f12732ffc..000000000000 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include -#include - -namespace at { namespace native { - -template class Op> -std::vector foreach_binary_op(TensorList tensors, at::ArrayRef scalars) { - std::vector> tensor_lists; - std::vector vec_res; - for (const auto& t: tensors) { - vec_res.emplace_back(at::native::empty_like(t)); - } - - tensor_lists.emplace_back(tensors.vec()); - tensor_lists.emplace_back(vec_res); - - AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda", [&]() { - multi_tensor_apply<2>(tensor_lists, scalars, BinaryOpScalarListFunctor()); - }); - return tensor_lists[1]; -} - -template class Op> -void foreach_binary_op_(TensorList tensors, at::ArrayRef scalars) { - std::vector> tensor_lists; - tensor_lists.emplace_back(tensors.vec()); - - AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda_", [&]() { - multi_tensor_apply<1>(tensor_lists, scalars, BinaryOpScalarListFunctor_()); - }); -} - -#define FOREACH_BINARY_OP_SCALARLIST(NAME, OP) \ -void foreach_tensor_##NAME##_scalarlist_kernel_cuda_(TensorList tensors, at::ArrayRef scalars) { \ - check_foreach_api_restrictions(tensors); \ - \ - if (!can_use_fast_route(tensors, scalars)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow_(tensors, scalars); \ - } \ - \ - foreach_binary_op_(tensors, scalars); \ -} \ - \ -std::vector foreach_tensor_##NAME##_scalarlist_kernel_cuda(TensorList tensors, at::ArrayRef scalars) { \ - check_foreach_api_restrictions(tensors); \ - \ - if (!can_use_fast_route(tensors, scalars)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow(tensors, scalars); \ - } \ - \ - return foreach_binary_op(tensors, scalars); \ -} - -FOREACH_BINARY_OP_SCALARLIST(add, std::plus); -FOREACH_BINARY_OP_SCALARLIST(sub, std::minus); -FOREACH_BINARY_OP_SCALARLIST(mul, std::multiplies); -FOREACH_BINARY_OP_SCALARLIST(div, std::divides); - -}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/ForeachFunctors.cuh b/aten/src/ATen/native/cuda/ForeachFunctors.cuh index e83eca3dd8e1..a04d27110c9a 100644 --- a/aten/src/ATen/native/cuda/ForeachFunctors.cuh +++ b/aten/src/ATen/native/cuda/ForeachFunctors.cuh @@ -118,121 +118,6 @@ struct BinaryOpScalarFunctor { } }; -template class Op> -struct BinaryOpScalarListFunctor_ { - __device__ void operator() ( - int chunk_size, - TensorListScalarListMetadata<1>& tl) { - int tensor_loc = tl.block_to_tensor[blockIdx.x]; - int chunk_idx = 
tl.block_to_chunk[blockIdx.x]; - int n = tl.sizes[tensor_loc]; - - T* x = (T*)tl.addresses[0][tensor_loc]; - x += chunk_idx * chunk_size; - - double y = tl.scalar_vals[tensor_loc]; - - n -= chunk_idx * chunk_size; - - T r_x[kILP]; - - // to make things simple, we put aligned case in a different code path - if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x)) { - for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { - // load - load_store(r_x, x, 0 , i_start); -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), y); - } - // store - load_store(x, r_x, i_start, 0); - } - } - else { - for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = 0; - int i = i_start + threadIdx.x + ii * blockDim.x; - if(i < n && i < chunk_size) { - r_x[ii] = x[i]; - } - } -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), y); - } -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - int i = i_start + threadIdx.x + ii * blockDim.x; - if(i < n && i < chunk_size) - x[i] = r_x[ii]; - } - } - } - } -}; - -template class Op> -struct BinaryOpScalarListFunctor { - __device__ void operator() ( - int chunk_size, - TensorListScalarListMetadata<2>& tl) { - int tensor_loc = tl.block_to_tensor[blockIdx.x]; - int chunk_idx = tl.block_to_chunk[blockIdx.x]; - int n = tl.sizes[tensor_loc]; - - T* x = (T*)tl.addresses[0][tensor_loc]; - x += chunk_idx * chunk_size; - - T* out = (T*)tl.addresses[1][tensor_loc]; - out += chunk_idx * chunk_size; - - double y = tl.scalar_vals[tensor_loc]; - - n -= chunk_idx * chunk_size; - - T r_x[kILP]; - - // to make things simple, we put aligned case in a different code path - if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x) && is_aligned(out)) { - for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { - // load - load_store(r_x, x, 0 , i_start); -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), y); - } - // store - load_store(out, r_x, i_start, 0); - } - } - else { - for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = 0; - int i = i_start + threadIdx.x + ii * blockDim.x; - if(i < n && i < chunk_size) { - r_x[ii] = x[i]; - } - } -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), y); - } -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - int i = i_start + threadIdx.x + ii * blockDim.x; - if(i < n && i < chunk_size) - out[i] = r_x[ii]; - } - } - } - } -}; - template class Op> struct BinaryOpListAlphaFunctor_ { __device__ void operator() ( diff --git a/aten/src/ATen/native/cuda/MultiTensorApply.cuh b/aten/src/ATen/native/cuda/MultiTensorApply.cuh index d162af19fd1b..f82a0d9a58c8 100644 --- a/aten/src/ATen/native/cuda/MultiTensorApply.cuh +++ b/aten/src/ATen/native/cuda/MultiTensorApply.cuh @@ -26,7 +26,6 @@ __device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int s // TensorListMetadata has to be < 4KB - the limit for kernel launch argument static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; -static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30}; template struct TensorListMetadata { @@ 
-36,15 +35,6 @@ template struct TensorListMetadata int block_to_chunk[depth_to_max_blocks[n-1]]; }; -template struct TensorListScalarListMetadata -{ - void* addresses[n][depth_to_max_tensors_scalarlist[n-1]]; - int sizes[depth_to_max_tensors_scalarlist[n-1]]; - double scalar_vals[depth_to_max_tensors_scalarlist[n-1]]; - unsigned char block_to_tensor[depth_to_max_blocks[n-1]]; - int block_to_chunk[depth_to_max_blocks[n-1]]; -}; - template C10_LAUNCH_BOUNDS_1(kBlockSize) __global__ void @@ -59,71 +49,11 @@ multi_tensor_apply_kernel( template void multi_tensor_apply( std::vector>& tensor_lists, - at::ArrayRef scalars, T callable, ArgTypes... args) { TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth."); const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); - size_t n_tensors = tensor_lists[0].size(); - TensorListScalarListMetadata tensorListMeta; - - int loc_block_info = 0; - int loc_tensor_info = 0; - for(size_t t = 0; t < n_tensors; t++) { - - tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t]; - - tensorListMeta.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); - for (int d = 0; d < depth; d++) { - tensorListMeta.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); - } - loc_tensor_info++; - - int chunks = (tensor_lists[0][t].numel() + kChunkSize - 1)/kChunkSize; - for (int chunk = 0; chunk < chunks; chunk++) { - tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; - tensorListMeta.block_to_chunk[loc_block_info] = chunk; - loc_block_info++; - - bool tensors_full = (loc_tensor_info == depth_to_max_tensors_scalarlist[depth-1] && - chunk == chunks - 1); - bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]); - bool last_chunk = (t == n_tensors - 1 && chunk == chunks - 1); - - if (tensors_full || blocks_full || last_chunk) { - multi_tensor_apply_kernel<<>>( - tensorListMeta, - callable, - args...); - - AT_CUDA_CHECK(cudaGetLastError()); - - // Reset. - loc_block_info = 0; - if(chunk == chunks - 1) { - loc_tensor_info = 0; - } - else { - tensorListMeta.sizes[0] = tensorListMeta.sizes[loc_tensor_info-1]; - tensorListMeta.scalar_vals[0] = tensorListMeta.scalar_vals[loc_tensor_info-1]; - for(int d = 0; d < depth; d++) { - tensorListMeta.addresses[d][0] = tensorListMeta.addresses[d][loc_tensor_info-1]; - } - loc_tensor_info = 1; - } - } - } - } - } - -template -void multi_tensor_apply( - std::vector>& tensor_lists, - T callable, - ArgTypes... 
args) { - TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth."); - const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); size_t n_tensors = tensor_lists[0].size(); TensorListMetadata tensorListMeta; diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8068bc1721df..f5bbb263ed9c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6187,7 +6187,6 @@ CUDA: foreach_tensor_add_scalar_kernel_cuda - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6195,7 +6194,6 @@ CUDA: foreach_tensor_add_scalar_kernel_cuda_ - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6203,7 +6201,6 @@ CUDA: foreach_tensor_sub_scalar_kernel_cuda - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6211,7 +6208,6 @@ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6219,7 +6215,6 @@ CUDA: foreach_tensor_mul_scalar_kernel_cuda - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6227,7 +6222,6 @@ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6235,39 +6229,34 @@ CUDA: foreach_tensor_div_scalar_kernel_cuda - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ -- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] - use_c10_dispatcher: full +- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[] device_guard: False variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow CUDA: foreach_tensor_add_list_kernel_cuda -- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () - use_c10_dispatcher: full +- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> () device_guard: False variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ -- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] - use_c10_dispatcher: full +- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[] device_guard: False variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow CUDA: foreach_tensor_sub_list_kernel_cuda -- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () - use_c10_dispatcher: full +- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> () device_guard: False variants: function dispatch: @@ -6275,7 +6264,6 @@ CUDA: foreach_tensor_sub_list_kernel_cuda_ - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full device_guard: False 
variants: function dispatch: @@ -6283,15 +6271,13 @@ CUDA: foreach_tensor_mul_list_kernel_cuda - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ -- func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full +- func: _foreach_div.List(Tensor(a!)[] self, Tensor[] other) -> Tensor[] device_guard: False variants: function dispatch: @@ -6299,79 +6285,13 @@ CUDA: foreach_tensor_div_list_kernel_cuda - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ -- func: _foreach_add.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_add_scalarlist_kernel_slow - CUDA: foreach_tensor_add_scalarlist_kernel_cuda - -- func: _foreach_add_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_add_scalarlist_kernel_slow_ - CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ - -- func: _foreach_sub.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_sub_scalarlist_kernel_slow - CUDA: foreach_tensor_sub_scalarlist_kernel_cuda - -- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_sub_scalarlist_kernel_slow_ - CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ - -- func: _foreach_div.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_div_scalarlist_kernel_slow - CUDA: foreach_tensor_div_scalarlist_kernel_cuda - -- func: _foreach_div_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_div_scalarlist_kernel_slow_ - CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ - -- func: _foreach_mul.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_mul_scalarlist_kernel_slow - CUDA: foreach_tensor_mul_scalarlist_kernel_cuda - -- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_mul_scalarlist_kernel_slow_ - CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ - - func: _foreach_exp(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6379,7 +6299,6 @@ CUDA: foreach_tensor_exp_cuda - func: _foreach_exp_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6387,7 +6306,6 @@ CUDA: foreach_tensor_exp_cuda_ - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6395,7 +6313,6 @@ CUDA: foreach_tensor_sqrt_cuda - func: _foreach_sqrt_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full 
device_guard: False variants: function dispatch: @@ -6403,7 +6320,6 @@ CUDA: foreach_tensor_sqrt_cuda_ - func: _foreach_addcdiv_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6411,7 +6327,6 @@ CUDA: foreach_tensor_addcdiv_cuda_ - func: _foreach_addcmul_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6419,7 +6334,6 @@ CUDA: foreach_tensor_addcmul_cuda_ - func: _foreach_addcdiv(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6427,7 +6341,6 @@ CUDA: foreach_tensor_addcdiv_cuda - func: _foreach_addcmul(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 4303fc563cfc..739a4de51951 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -99,10 +99,6 @@ ("preprocess", datetime.date(2020, 10, 1)), ("compile", datetime.date(2020, 10, 1)), ("execute", datetime.date(2020, 10, 1)), - ("aten::_foreach_add", datetime.date(2020, 10, 1)), - ("aten::_foreach_sub_", datetime.date(2020, 10, 1)), - ("aten::_foreach_div", datetime.date(2020, 10, 1)), - ("aten::_foreach_sub", datetime.date(2020, 10, 1)), ] diff --git a/test/test_foreach.py b/test/test_foreach.py index 85d79096b2ad..8369ba5b9be5 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -4,30 +4,21 @@ from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, skipCUDAIfRocm class TestForeach(TestCase): - foreach_bin_ops = [ + bin_ops = [ torch._foreach_add, - torch._foreach_sub, - torch._foreach_mul, - torch._foreach_div, - ] - - foreach_bin_ops_ = [ torch._foreach_add_, + torch._foreach_sub, torch._foreach_sub_, + torch._foreach_mul, torch._foreach_mul_, + torch._foreach_div, torch._foreach_div_, ] - torch_bin_ops = [ - torch.add, - torch.sub, - torch.mul, - torch.div, - ] - def _get_test_data(self, device, dtype, N): if dtype in [torch.bfloat16, torch.bool, torch.float16]: tensors = [torch.randn(N, N, device=device).to(dtype) for _ in range(N)] + elif dtype in torch.testing.get_all_int_dtypes(): tensors = [torch.randint(1, 100, (N, N), device=device, dtype=dtype) for _ in range(N)] else: @@ -35,39 +26,36 @@ def _get_test_data(self, device, dtype, N): return tensors - def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op): - for N in [30, 300]: - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - - expected = [torch_op(tensors1[i], tensors2[i]) for i in range(N)] - res = foreach_op(tensors1, tensors2) - foreach_op_(tensors1, tensors2) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, res) - - def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op): - for N in [30, 300]: - tensors1 = self._get_test_data(device, dtype, N) - expected = [torch_op(tensors1[i]) for i in range(N)] - res = foreach_op(tensors1) - foreach_op_(tensors1) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, expected) - - def _test_pointwise_op(self, device, dtype, 
foreach_op, foreach_op_, torch_op): - for N in [30, 300]: - tensors = self._get_test_data(device, dtype, N) - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - value = 2 - - expected = [torch_op(tensors[i], tensors1[i], tensors2[i], value=value) for i in range(N)] - - res = foreach_op(tensors, tensors1, tensors2, value) - foreach_op_(tensors, tensors1, tensors2, value) - self.assertEqual(res, tensors) - self.assertEqual(tensors, expected) + def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + + expected = [torch_op(tensors1[i], tensors2[i]) for i in range(N)] + res = foreach_op(tensors1, tensors2) + foreach_op_(tensors1, tensors2) + self.assertEqual(res, tensors1) + self.assertEqual(tensors1, expected) + + def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): + tensors1 = self._get_test_data(device, dtype, N) + expected = [torch_op(tensors1[i]) for i in range(N)] + res = foreach_op(tensors1) + foreach_op_(tensors1) + self.assertEqual(res, tensors1) + self.assertEqual(tensors1, expected) + + def _test_pointwise_op(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): + tensors = self._get_test_data(device, dtype, N) + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + value = 2 + + expected = [torch_op(tensors[i], tensors1[i], tensors2[i], value=value) for i in range(N)] + + res = foreach_op(tensors, tensors1, tensors2, value) + foreach_op_(tensors, tensors1, tensors2, value) + self.assertEqual(res, tensors) + self.assertEqual(tensors, expected) def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): tensors1 = self._get_test_data(device, dtype, N) @@ -75,8 +63,8 @@ def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_ alpha = 2 expected = [torch_op(tensors1[i], torch.mul(tensors2[i], alpha)) for i in range(N)] - res = foreach_op(tensors1, tensors2, alpha=alpha) - foreach_op_(tensors1, tensors2, alpha=alpha) + res = foreach_op(tensors1, tensors2, alpha) + foreach_op_(tensors1, tensors2, alpha) self.assertEqual(res, tensors1) if dtype == torch.bool: @@ -100,7 +88,7 @@ def test_exp(self, device, dtype): @skipCUDAIfRocm @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) def test_addcmul(self, device, dtype): - if self.device_type == 'cpu': + if device == 'cpu': if dtype == torch.half: with self.assertRaisesRegex(RuntimeError, r"\"addcmul_cpu_out\" not implemented for \'Half\'"): self._test_pointwise_op(device, dtype, torch._foreach_addcmul, @@ -117,7 +105,7 @@ def test_addcdiv(self, device, dtype): self._test_pointwise_op(device, dtype, torch._foreach_addcdiv, torch._foreach_addcdiv_, torch.addcdiv) return - if self.device_type == 'cpu': + if device == 'cpu': if dtype == torch.half: with self.assertRaisesRegex(RuntimeError, r"\"addcdiv_cpu_out\" not implemented for \'Half\'"): self._test_pointwise_op(device, dtype, torch._foreach_addcdiv, @@ -130,372 +118,83 @@ def test_addcdiv(self, device, dtype): # @dtypes(*torch.testing.get_all_dtypes()) def test_int_scalar(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalar = 3 - expected = 
[torch_bin_op(t, scalar) for t in tensors] - - res = foreach_bin_op(tensors, scalar) - - if dtype == torch.bool: - self.assertEqual(res, expected) - - with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalar) - return - - - if foreach_bin_op_ == torch._foreach_div_ and dtype in torch.testing.integral_types() and self.device_type == "cpu": - with self.assertRaisesRegex(RuntimeError, - "can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalar) - return - - # TODO[type promotion]: Fix once type promotion is enabled. - if dtype in torch.testing.integral_types() and self.device_type == 'cuda': - self.assertEqual(res, [e.to(dtype) for e in expected]) - - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, [e.to(dtype) for e in expected]) - else: - self.assertEqual(res, expected) - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, expected) - - # TODO[Fix scalar list]: - # We need to update codegen to correctly handle function overloads with float[] and int[]. - # As optimizers work with float tensors, the result will always be torch.float32 for now. - # Current schema is using 'float[]' as scalar list type. - @dtypes(*torch.testing.get_all_dtypes()) - def test_int_scalarlist(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalars = [1 for _ in range(N)] - expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] - - # we dont support bool and complex types on CUDA for now - if (dtype in torch.testing.get_all_complex_dtypes() or dtype == torch.bool) and self.device_type == 'cuda': - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op_(tensors, scalars) - - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op(tensors, scalars) - return - - res = foreach_bin_op(tensors, scalars) - - if dtype == torch.bool: - self.assertEqual(res, [torch_bin_op(t.to(torch.float32), s) for t, s in zip(tensors, scalars)]) - - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalars) - return - - if dtype in torch.testing.integral_types(): - if self.device_type == 'cpu': - self.assertEqual(res, [e.to(torch.float32) for e in expected]) - else: - # TODO[type promotion]: Fix once type promotion is enabled. 
- self.assertEqual(res, [e.to(dtype) for e in expected]) - else: - self.assertEqual(res, expected) - - if dtype in torch.testing.integral_types() and self.device_type == 'cpu': - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalars) - return - else: - foreach_bin_op_(tensors, scalars) - self.assertEqual(res, tensors) + tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] + int_scalar = 1 + + # bool tensor + 1 will result in int64 tensor + if dtype == torch.bool: + expected = [torch.ones(10, 10, device=device, dtype=torch.int64) for _ in range(10)] + else: + expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] + + res = torch._foreach_add(tensors, int_scalar) + self.assertEqual(res, expected) + + if dtype in [torch.bool]: + with self.assertRaisesRegex(RuntimeError, + "result type Long can't be cast to the desired output type Bool"): + torch._foreach_add_(tensors, int_scalar) + else: + torch._foreach_add_(tensors, int_scalar) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_float_scalar(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalar = 3.3 - expected = [torch_bin_op(t, scalar) for t in tensors] - - if dtype == torch.bool: - if foreach_bin_op == torch._foreach_sub: - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): - foreach_bin_op_(tensors, scalar) - - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): - foreach_bin_op(tensors, scalar) - return - - res = foreach_bin_op(tensors, scalar) - self.assertEqual(res, expected) - - if dtype in torch.testing.integral_types(): - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalar) - return - - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, expected) + tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] + float_scalar = 1. 
- @dtypes(*torch.testing.get_all_dtypes()) - def test_float_scalarlist(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalars = [1.1 for _ in range(N)] - expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] - - # we dont support bool and complex types on CUDA for now - if (dtype in torch.testing.get_all_complex_dtypes() or dtype == torch.bool) and self.device_type == 'cuda': - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op_(tensors, scalars) - - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op(tensors, scalars) - return - - res = foreach_bin_op(tensors, scalars) - - if dtype == torch.bool: - # see TODO[Fix scalar list] - self.assertEqual(res, [torch_bin_op(t.to(torch.float32), s) for t, s in zip(tensors, scalars)]) - - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalars) - return - - if dtype in torch.testing.integral_types() and self.device_type == 'cuda': - # see TODO[Fix scalar list] - self.assertEqual(res, [e.to(dtype) for e in expected]) - - foreach_bin_op_(tensors, scalars) - self.assertEqual(tensors, res) - return - else: - self.assertEqual(res, expected) - - if dtype in torch.testing.integral_types() and self.device_type == "cpu": - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalars) - return - - foreach_bin_op_(tensors, scalars) - self.assertEqual(tensors, expected) + # float scalar + integral tensor will result in float tensor + if dtype in [torch.uint8, torch.int8, torch.int16, + torch.int32, torch.int64, torch.bool]: + expected = [torch.ones(10, 10, device=device, dtype=torch.float32) for _ in range(10)] + else: + expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] + + res = torch._foreach_add(tensors, float_scalar) + self.assertEqual(res, expected) + + if dtype in [torch.uint8, torch.int8, torch.int16, + torch.int32, torch.int64, torch.bool]: + self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, float_scalar)) + else: + torch._foreach_add_(tensors, float_scalar) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_complex_scalar(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalar = 3 + 5j - expected = [torch_bin_op(t, scalar) for t in tensors] - - if dtype == torch.bool: - if foreach_bin_op == torch._foreach_sub: - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): - foreach_bin_op_(tensors, scalar) - - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): - foreach_bin_op(tensors, scalar) - return - - if dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=True) and \ - self.device_type == 'cuda': - with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): - foreach_bin_op_(tensors, scalar) - - with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): - foreach_bin_op(tensors, scalar) - return - - res = foreach_bin_op(tensors, scalar) - 
self.assertEqual(res, expected) - - if dtype not in [torch.complex64, torch.complex128]: - with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalar) - else: - foreach_bin_op_(tensors, scalar) - self.assertEqual(res, tensors) + tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] + complex_scalar = 3 + 5j - @dtypes(*torch.testing.get_all_dtypes()) - def test_complex_scalarlist(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalars = [3 + 5j for _ in range(N)] - expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] - - if dtype == torch.bool: - if foreach_bin_op == torch._foreach_sub: - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): - foreach_bin_op_(tensors, scalar) - - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): - foreach_bin_op(tensors, scalar) - return - - with self.assertRaisesRegex(TypeError, "argument 'scalars' must be tuple of floats"): - res = foreach_bin_op(tensors, scalars) - - with self.assertRaisesRegex(TypeError, "argument 'scalars' must be tuple of floats"): - foreach_bin_op_(tensors, scalars) + # bool tensor + 1 will result in int64 tensor + expected = [torch.add(complex_scalar, torch.zeros(10, 10, device=device, dtype=dtype)) for _ in range(10)] + + if dtype in [torch.float16, torch.float32, torch.float64, torch.bfloat16] and device == 'cuda:0': + # value cannot be converted to dtype without overflow: + self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar)) + self.assertRaises(RuntimeError, lambda: torch._foreach_add(tensors, complex_scalar)) + return + + res = torch._foreach_add(tensors, complex_scalar) + self.assertEqual(res, expected) + + if dtype not in [torch.complex64, torch.complex128]: + self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar)) + else: + torch._foreach_add_(tensors, complex_scalar) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_bool_scalar(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalar = True - - if dtype == torch.bool: - expected = [torch_bin_op(t, scalar) for t in tensors] - res = foreach_bin_op(tensors, scalar) - - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, res) - return - - if foreach_bin_op == torch._foreach_sub and self.device_type == "cpu": - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator"): - res = foreach_bin_op(tensors, scalar) - - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator"): - foreach_bin_op_(tensors, scalar) - elif foreach_bin_op == torch._foreach_sub and self.device_type == 'cuda': - res = foreach_bin_op(tensors, scalar) - self.assertEqual(res, foreach_bin_op(tensors, 1)) - - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, res) - else: - expected = [torch_bin_op(t, scalar) for t in tensors] - res = foreach_bin_op(tensors, scalar) - - # TODO[type promotion]: Fix once type promotion is enabled. 
- if dtype in torch.testing.integral_types() and self.device_type == 'cuda': - self.assertEqual(res, [e.to(dtype) for e in expected]) - else: - self.assertEqual(res, expected) - - if dtype in torch.testing.integral_types(): - if foreach_bin_op == torch._foreach_div and self.device_type == "cpu": - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired "): - foreach_bin_op_(tensors, scalar) - else: - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, res) - else: - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, expected) + tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] + bool_scalar = True - @dtypes(*torch.testing.get_all_dtypes()) - def test_bool_scalarlist(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalars = [True for _ in range(N)] - - if dtype == torch.bool: - if self.device_type == 'cuda': - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op(tensors, scalars) - - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op_(tensors, scalars) - return - else: - if foreach_bin_op == torch._foreach_sub: - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): - foreach_bin_op_(tensors, scalars) - - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): - foreach_bin_op(tensors, scalars) - else: - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired"): - foreach_bin_op_(tensors, scalars) - - res = foreach_bin_op(tensors, scalars) - for r in res: - self.assertTrue(r.dtype == torch.float32) - else: - # we dont support bool and complex types on CUDA for now - if (dtype in torch.testing.get_all_complex_dtypes()) and self.device_type == 'cuda': - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op_(tensors, scalars) - - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op(tensors, scalars) - return - - if foreach_bin_op == torch._foreach_sub: - if self.device_type == "cpu": - # see TODO[Fix scalar list] - res = foreach_bin_op(tensors, scalars) - if dtype in torch.testing.integral_types(): - self.assertEqual(res, [r.to(torch.float32) for r in foreach_bin_op(tensors, 1)]) - - with self.assertRaisesRegex(RuntimeError, "esult type Float can't be cast to the "): - foreach_bin_op_(tensors, scalars) - else: - self.assertEqual(res, foreach_bin_op(tensors, 1)) - foreach_bin_op_(tensors, scalars) - self.assertEqual(res, tensors) - else: - # see TODO[Fix scalar list] - res = foreach_bin_op(tensors, scalars) - if dtype in torch.testing.integral_types(): - self.assertEqual(res, [r.to(dtype) for r in foreach_bin_op(tensors, 1)]) - else: - self.assertEqual(res, foreach_bin_op(tensors, 1)) - - foreach_bin_op_(tensors, scalars) - self.assertEqual(res, tensors) - else: - if self.device_type == "cpu": - expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] - res = foreach_bin_op(tensors, scalars) - - # see TODO[Fix scalar list] - if dtype in torch.testing.integral_types(): - self.assertEqual(res, [e.to(torch.float32) for e in expected]) - else: - self.assertEqual(res, expected) - - if dtype in torch.testing.integral_types(): - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to 
the desired "): - foreach_bin_op_(tensors, scalars) - else: - foreach_bin_op_(tensors, scalars) - self.assertEqual(tensors, expected) - else: - expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] - res = foreach_bin_op(tensors, scalars) - - if dtype in torch.testing.integral_types(): - self.assertEqual(res, [e.to(dtype) for e in expected]) - else: - self.assertEqual(res, expected) - - foreach_bin_op_(tensors, scalars) - self.assertEqual(res, tensors) + expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] + + res = torch._foreach_add(tensors, bool_scalar) + self.assertEqual(res, expected) + + torch._foreach_add_(tensors, bool_scalar) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_add_with_different_size_tensors(self, device, dtype): @@ -549,9 +248,9 @@ def test_add_list_error_cases(self, device): # One empty list tensors1.append(torch.tensor([1], device=device)) - with self.assertRaisesRegex(RuntimeError, "Scalars list must have at least one value."): + with self.assertRaisesRegex(RuntimeError, "Tensor list must have at least one tensor."): torch._foreach_add(tensors1, tensors2) - with self.assertRaisesRegex(RuntimeError, "Scalars list must have at least one value."): + with self.assertRaisesRegex(RuntimeError, "Tensor list must have at least one tensor."): torch._foreach_add_(tensors1, tensors2) # Lists have different amount of tensors @@ -619,25 +318,13 @@ def test_div_list(self, device, dtype): self.skipTest("Skipped! See https://github.com/pytorch/pytorch/issues/44489") return - for N in [30, 300]: - tensors1 = self._get_test_data(device, dtype, N) - - if dtype in [torch.bfloat16, torch.bool, torch.float16]: - tensors2 = [torch.zeros(N, N, device=device, dtype=dtype).add(2) for _ in range(N)] - else: - tensors2 = self._get_test_data(device, dtype, N) - - expected = [torch.div(tensors1[i], tensors2[i]) for i in range(N)] - res = torch._foreach_div(tensors1, tensors2) - torch._foreach_div_(tensors1, tensors2) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, res) + self._test_bin_op_list(device, dtype, torch._foreach_div, torch._foreach_div_, torch.div) def test_bin_op_list_error_cases(self, device): tensors1 = [] tensors2 = [] - for bin_op in self.foreach_bin_ops + self.foreach_bin_ops_: + for bin_op in self.bin_ops: # Empty lists with self.assertRaises(RuntimeError): bin_op(tensors1, tensors2) diff --git a/test/test_native_functions.py b/test/test_native_functions.py index e5afc79f037a..869c7aad47fb 100644 --- a/test/test_native_functions.py +++ b/test/test_native_functions.py @@ -58,7 +58,7 @@ def fake_module(values, const): self.do_test_optional_floatlist_with_module(fake_module) def test_optional_floatlist_invalid(self): - with self.assertRaisesRegex(TypeError, "must be tuple of floats, not list"): + with self.assertRaisesRegex(TypeError, "must be .* but found"): FloatListWrapperModule()(torch.zeros(1), ["hi"]) with self.assertRaisesRegex(RuntimeError, "value of type .* instead found type"): diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 8f272de9a5f6..995dff38030b 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -281,7 +281,6 @@ def create_python_bindings(python_functions, is_python_method, module): 'c10::optional': 'toBoolOptional', 'c10::optional': 'toDoubleOptional', 'c10::optional>': 'doublelistOptional', - 'ArrayRef': 'doublelist', 'IntArrayRef': 'intlist', 'Scalar': 'scalar', 'ScalarType': 
'scalartype', diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 673af99bce77..62e9b8dd227f 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -44,7 +44,6 @@ using at::Generator; using at::TensorList; using at::Dimname; using at::DimnameList; -using at::ArrayRef; using namespace torch::autograd::utils; diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 4ec0dc428b81..b0c470c91b6a 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -304,10 +304,6 @@ def __post_init__(self) -> None: # TODO: fixme if str(self.name) not in [ '_amp_non_finite_check_and_unscale_', - '_foreach_add_.ScalarList', - '_foreach_sub_.ScalarList', - '_foreach_mul_.ScalarList', - '_foreach_div_.ScalarList', '_foreach_add_.Scalar', '_foreach_sub_.Scalar', '_foreach_mul_.Scalar', diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index d24966f9fb52..7079c6750223 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -146,7 +146,6 @@ def type_to_python(typename, size=None): 'Dimname': 'Union[str, ellipsis, None]', 'DimnameList': 'Sequence[Union[str, ellipsis, None]]', 'QScheme': '_qscheme', - 'ArrayRef' : 'Sequence[float]' }[typename] return typename diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index f9e26af63ada..e954bef398e9 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -366,23 +366,6 @@ bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector return true; } -bool is_float_list(PyObject* obj) { - auto tuple = six::isTuple(obj); - if (!(tuple || PyList_Check(obj))) { - return false; - } - - auto size = tuple ? PyTuple_GET_SIZE(obj) : PyList_GET_SIZE(obj); - if (size > 0) { - PyObject* iobj = tuple ? PyTuple_GET_ITEM(obj, 0) : PyList_GET_ITEM(obj, 0); - if (!THPUtils_checkDouble(iobj) && !PyComplex_Check(iobj)) { - return false; - } - } - - return true; -} - // argnum is needed for raising the TypeError, it's used in the error message. auto FunctionParameter::check(PyObject* obj, std::vector &overloaded_args, int argnum) -> bool { @@ -437,9 +420,7 @@ auto FunctionParameter::check(PyObject* obj, std::vector &overloaded // if a size is specified (e.g. IntArrayRef[2]) we also allow passing a single int return size > 0 && THPUtils_checkLong(obj); } - case ParameterType::FLOAT_LIST: { - return is_float_list(obj); - } + case ParameterType::FLOAT_LIST: return (PyTuple_Check(obj) || PyList_Check(obj)); case ParameterType::GENERATOR: return THPGenerator_Check(obj); case ParameterType::BOOL: return PyBool_Check(obj); case ParameterType::STORAGE: return isStorage(obj); @@ -920,7 +901,6 @@ PythonArgs PythonArgParser::raw_parse(PyObject* self, PyObject* args, PyObject* print_error(self, args, kwargs, parsed_args); } - void PythonArgParser::print_error(PyObject* self, PyObject* args, PyObject* kwargs, PyObject* parsed_args[]) { // NOLINT auto num_args = PyTuple_GET_SIZE(args) + (kwargs ? 
PyDict_Size(kwargs) : 0); std::vector plausible_idxs; diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index d0e2bdc074ff..78efb6cf2db3 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -173,8 +173,6 @@ struct PythonArgs { inline c10::optional toBoolOptional(int i); inline c10::optional toDoubleOptional(int i); inline c10::OptionalArray doublelistOptional(int i); - inline std::vector doublelist(int i); - inline std::vector getDoublelist(int i); inline at::Layout layout(int i); inline at::Layout layoutWithDefault(int i, at::Layout default_layout); inline c10::optional layoutOptional(int i); @@ -371,7 +369,10 @@ inline c10::OptionalArray PythonArgs::intlistOptional(int i) { return intlist(i); } -inline std::vector PythonArgs::getDoublelist(int i) { +inline c10::OptionalArray PythonArgs::doublelistOptional(int i) { + if (!args[i]) { + return {}; + } PyObject* arg = args[i]; auto tuple = PyTuple_Check(arg); auto size = tuple ? PyTuple_GET_SIZE(arg) : PyList_GET_SIZE(arg); @@ -389,17 +390,6 @@ inline std::vector PythonArgs::getDoublelist(int i) { return res; } -inline c10::OptionalArray PythonArgs::doublelistOptional(int i) { - if (!args[i]) { - return {}; - } - return this->getDoublelist(i); -} - -inline std::vector PythonArgs::doublelist(int i) { - return this->getDoublelist(i); -} - inline at::ScalarType PythonArgs::scalartypeWithDefault(int i, at::ScalarType default_scalartype) { if (!args[i]) return default_scalartype; return scalartype(i); From c211a9102f20cf85eba3a395a20567baa73c764f Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Thu, 24 Sep 2020 11:54:41 -0700 Subject: [PATCH 099/449] add rocm 3.8 to nightly builds (#45222) Summary: Corresponding change in builder repo: https://github.com/pytorch/builder/pull/528. 
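For reference, a small sketch of how the new version label flows into the build matrix (the CUDA list below is assumed, for illustration only; only the ROCm entries come from this change):

```
# Assumed CUDA versions, for illustration only; the real values live in dimensions.py.
CUDA_VERSIONS = ["10.1", "10.2"]

ROCM_VERSIONS = ["3.7", "3.8"]                        # "3.8" is the new entry
ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]

# Every label becomes a gpu_version choice for the nightly binary builds.
GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS
print(GPU_VERSIONS)  # [None, 'cuda10.1', 'cuda10.2', 'rocm3.7', 'rocm3.8']
```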
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45222 Reviewed By: ezyang Differential Revision: D23894831 Pulled By: walterddr fbshipit-source-id: c6a256ec325ddcf5836b4d293f546368d58db538 --- .circleci/cimodel/data/binary_build_data.py | 6 +- .circleci/cimodel/data/dimensions.py | 5 +- .circleci/config.yml | 156 ++++++++++++++++++++ 3 files changed, 163 insertions(+), 4 deletions(-) diff --git a/.circleci/cimodel/data/binary_build_data.py b/.circleci/cimodel/data/binary_build_data.py index 58fbbd08f994..21b6eebef5a1 100644 --- a/.circleci/cimodel/data/binary_build_data.py +++ b/.circleci/cimodel/data/binary_build_data.py @@ -54,7 +54,7 @@ def get_processor_arch_name(gpu_version): )), # Skip CUDA-9.2 builds on Windows windows=( - [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92', "rocm3.7"]], + [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS], OrderedDict( wheel=dimensions.STANDARD_PYTHON_VERSIONS, conda=dimensions.STANDARD_PYTHON_VERSIONS, @@ -142,11 +142,11 @@ def get_children(self): # XXX disabling conda rocm build since docker images are not there if self.find_prop("package_format") == 'conda': - gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions) + gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions) # XXX libtorch rocm build is temporarily disabled if self.find_prop("package_format") == 'libtorch': - gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions) + gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions) return [ArchConfigNode(self, v) for v in gpu_versions] diff --git a/.circleci/cimodel/data/dimensions.py b/.circleci/cimodel/data/dimensions.py index 93d4d645a53a..1f83cd61b13c 100644 --- a/.circleci/cimodel/data/dimensions.py +++ b/.circleci/cimodel/data/dimensions.py @@ -9,9 +9,12 @@ ROCM_VERSIONS = [ "3.7", + "3.8", ] -GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ["rocm" + v for v in ROCM_VERSIONS] +ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS] + +GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS STANDARD_PYTHON_VERSIONS = [ "3.6", diff --git a/.circleci/config.yml b/.circleci/config.yml index b70a090bed72..700a4155441d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2130,6 +2130,39 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-rocm:3.7" + - binary_linux_build: + name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_build: + name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_build: + name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" - binary_linux_build: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_build build_environment: "conda 3.6 cpu devtoolset7" @@ -3429,6 +3462,51 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.7" use_cuda_docker_runtime: "1" 
resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_test build_environment: "conda 3.6 cpu devtoolset7" @@ -4932,6 +5010,48 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: rocm3.7 + - binary_upload: + name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 + - binary_upload: + name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 + - binary_upload: + name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 - binary_upload: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_upload context: org-member @@ -7458,6 +7578,42 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.7" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.8m 
rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_conda_3_6_cpu_devtoolset7_nightly build_environment: "conda 3.6 cpu devtoolset7" From c3a5aed5f7f13f193c7444b1c7af12344e8ce964 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 24 Sep 2020 12:10:10 -0700 Subject: [PATCH 100/449] Run pytorch_core CUDA tests on GPU using TPX Summary: Modify contbuild to disable sanitizers, add option to run "cuda" test using TPX RE (Note: this ignores all push blocking failures!) Test Plan: CI Reviewed By: walterddr, cspanda Differential Revision: D23854578 fbshipit-source-id: 327d7cc3655c17034a6a7bc78f69967403290623 --- test/test_cuda.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_cuda.py b/test/test_cuda.py index 2d23954cfcf8..6c904a67e619 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -21,7 +21,7 @@ from torch.testing._internal.common_methods_invocations import tri_tests_args, tri_large_tests_args, \ _compare_trilu_indices, _compare_large_trilu_indices from torch.testing._internal.common_utils import TestCase, get_gpu_type, freeze_rng_state, run_tests, \ - NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, \ + NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, IS_SANDCASTLE, \ slowTest, skipCUDANonDefaultStreamIf, TEST_WITH_ROCM, TEST_NUMPY from torch.testing._internal.autocast_test_lists import AutocastTestLists @@ -1732,6 +1732,7 @@ def test_streaming_backwards_device_transfer(self): self.assertTrue(b.grad.sum().item() == 4 * size) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @unittest.skipIf(not IS_SANDCASTLE, "Does not work on Sandcastle") def test_cuda_init_race(self): # See https://github.com/pytorch/pytorch/issues/16559 import subprocess From e2bcdc7b697de757991664d81735d797e70be59b Mon Sep 17 00:00:00 2001 From: Xiaomeng Yang Date: Thu, 24 Sep 2020 12:28:12 -0700 Subject: [PATCH 101/449] [Caffe2] Fix LayerNormOp when batch_size == 0. (#45250) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45250 [Caffe2] Fix LayerNormOp when batch_size == 0. 
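The expected behavior in this degenerate case is to return empty outputs rather than crash. A minimal sketch of the shapes involved (assuming a 2-D input with axis == 1, mirroring the reference function in the test below):

```
import numpy as np

X = np.random.randn(0, 4).astype(np.float32)         # M == 0 rows, N == 4 features
Y = np.zeros_like(X)                                  # output keeps shape (0, 4)
mean = np.zeros(X.shape[:1] + (1,), dtype=X.dtype)    # shape (0, 1)
sigma = np.zeros(X.shape[:1] + (1,), dtype=X.dtype)   # shape (0, 1)
```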
Test Plan: buck test mode/dev-nosan //caffe2/caffe2/python/operator_test:layer_norm_op_test Reviewed By: houseroad Differential Revision: D23892091 fbshipit-source-id: 9a34654dd6880c9d14b7111fcf850e4f48ffdf91 --- caffe2/operators/layer_norm_op.h | 15 ++++++++++ .../operator_test/layer_norm_op_test.py | 28 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/caffe2/operators/layer_norm_op.h b/caffe2/operators/layer_norm_op.h index e1e8ec0693d6..543ad8dd0b34 100644 --- a/caffe2/operators/layer_norm_op.h +++ b/caffe2/operators/layer_norm_op.h @@ -52,6 +52,11 @@ class LayerNormOp final : public Operator { T* sigma_data = sigma->template mutable_data(); T* scale_data = scale_.template mutable_data(); T* bias_data = bias_.template mutable_data(); + + if (M == 0) { + return true; + } + const std::array X_dims = {M, N}; const std::array Y_dims = {M, 1}; math::Moments( @@ -174,6 +179,16 @@ class LayerNormGradientOp final : public Operator { g_scale_data = g_scale_.template mutable_data(); } + if (M == 0) { + if (N > 0 && dgamma_data != nullptr) { + math::Set(N, T(0), dgamma_data, &context_); + } + if (N > 0 && dbeta_data != nullptr) { + math::Set(N, T(0), dbeta_data, &context_); + } + return true; + } + ComputeInternalGradients( M, N, dY_data, X_data, gamma_data, dX_data, ds_data, db_data); ComputeFusedParams( diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index 56cd72d69991..62e94afe9e7d 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -373,6 +373,34 @@ def test_layer_norm_brew_wrapper(self, X, gc, dc): self.ws.create_net(model.param_init_net).run() self.ws.create_net(model.net).run() + @given(N=st.integers(1, 10), elementwise_affine=st.booleans(), **hu.gcs) + @settings(deadline=None) + def test_layer_norm_with_empty_batch(self, N, elementwise_affine, gc, dc): + X = np.random.randn(0, N).astype(np.float32) + gamma = np.random.rand(N).astype(np.float32) + beta = np.random.rand(N).astype(np.float32) + + op = core.CreateOperator( + "LayerNorm", + ["X", "gamma", "beta"] if elementwise_affine else ["X"], + ["Y", "mean", "sigma"], + elementwise_affine=elementwise_affine, + ) + + def ref(X, gamma=None, beta=None): + Y = np.zeros_like(X) + axis = 1 + mean = np.zeros(X.shape[:axis] + (1,), dtype=X.dtype) + sigma = np.zeros(X.shape[:axis] + (1,), dtype=X.dtype) + return Y, mean, sigma + + + inputs = [X, gamma, beta] if elementwise_affine else [X] + self.assertReferenceChecks(gc, op, inputs, ref) + self.assertDeviceChecks(dc, op, inputs, [0, 1]) + for i in range(len(inputs)): + self.assertGradientChecks(gc, op, inputs, i, [0]) + if __name__ == "__main__": unittest.main() From 022ba5a78bc18fd1947cd666d5eab7dbb4eb7328 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 24 Sep 2020 13:27:57 -0700 Subject: [PATCH 102/449] Make ddp_comm_hook_wrapper a private method. (#44643) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44643 This method is not used anywhere else. Also formatted the file. 
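The public entry point is unchanged; a usage sketch based on the docstring below (the DDP model and process group are assumed to be constructed elsewhere):

```
from torch.distributed.algorithms.ddp_comm_hooks import DDPCommHookType

def register_allreduce_hook(ddp_model, process_group):
    # The enum value is a partial over the (now private) wrapper, so callers
    # only pass the DistributedDataParallel model and the hook state.
    DDPCommHookType.ALLREDUCE.value(model=ddp_model, state=process_group)
```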
Test Plan: buck test caffe2/test/distributed/algorithms/ddp_comm_hooks:test_ddp_hooks Reviewed By: pritamdamania87 Differential Revision: D23675945 fbshipit-source-id: 2d04f94589a20913e46b8d71e6a39b70940c1461 --- .../algorithms/ddp_comm_hooks/__init__.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py index 51678fe44590..6b07e23c9476 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py @@ -6,24 +6,27 @@ from torch.nn.parallel import DistributedDataParallel -def ddp_comm_hook_wrapper(comm_hook, model, state): +def _ddp_comm_hook_wrapper(comm_hook, model, state): model._register_comm_hook(state, comm_hook) class DDPCommHookType(Enum): - ''' + """ DDPCommHookType enumerates the hooks of ``torch.distributed.algorithms.ddp_comm_hooks`` as names and ``ddp_comm_hook_wrapper`` partials with hook specified. As an example, you can register allreduce hook by ``DDPCommHookType.ALLREDUCE.value(model=model, state=process_group)``. - ''' - ALLREDUCE = partial(ddp_comm_hook_wrapper, comm_hook=default.allreduce_hook) - FP16_COMPRESS = partial(ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook) + """ + + ALLREDUCE = partial(_ddp_comm_hook_wrapper, comm_hook=default.allreduce_hook) + FP16_COMPRESS = partial( + _ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook + ) QUANTIZE_PER_TENSOR = partial( - ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook + _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook ) QUANTIZE_PER_CHANNEL = partial( - ddp_comm_hook_wrapper, comm_hook=quantization.quantization_perchannel_hook + _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_perchannel_hook ) From cbe1eac1f48329b4706149cd3a05898213542398 Mon Sep 17 00:00:00 2001 From: Danny Huang Date: Thu, 24 Sep 2020 14:20:23 -0700 Subject: [PATCH 103/449] [caffe2] adds Cancel to SafeDequeueBlobsOp and SafeEnqueueBlobsOp (#45177) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45177 ## Motivation * To be able to make C2 ops cancellable so we can safely exit. * Some C2 operators are now blocking thus being non-cancellable. If an error occurs we need to be able to safely stop all net execution so we can throw the exception to the caller. ## Summary * When an error occurs in a net or it got cancelled, running ops will have the `Cancel` method called. This diff adds `Cancel` method to the `SafeEnqueueBlobsOp` and `SafeDequeueBlobsOp` to have the call queue->close() to force all the blocking ops to return. * Adds unit test that verified the error propagation. 
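A condensed sketch of the caller-side pattern this enables (adapted from the hypothesis test referenced in the Test Plan below; the queue sizes and sleep are illustrative):

```
import threading
import time

from caffe2.python import core, workspace

init_net = core.Net("init_net")
queue = init_net.CreateBlobsQueue([], "queue_name", capacity=2, num_blobs=1)
ws = workspace.Workspace()
ws.create_net(init_net).run()

net = core.Net("net")
net.Proto().type = "async_scheduling"
net.SafeDequeueBlobs([queue], 2)      # asks for more blobs than will ever arrive

net_instance = ws.create_net(net)

def cancel_later():
    time.sleep(5)
    net_instance.cancel()             # Cancel() closes the queue, unblocking the op

threading.Thread(target=cancel_later).start()
net_instance.run()                    # raises once the blocked dequeue is cancelled
```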
Test Plan: ## Unit test added to verify that queue ops propagate errors ``` buck test caffe2/caffe2/python:hypothesis_test -- test_safe_dequeue_blob__raises_exception_when_hang --stress-runs 1000 ``` ``` Summary Pass: 1000 ListingSuccess: 1 ``` Reviewed By: d4l3k Differential Revision: D23846967 fbshipit-source-id: c7ddd63259e033ed0bed9df8e1b315f87bf59394 --- caffe2/queue/queue_ops.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/caffe2/queue/queue_ops.h b/caffe2/queue/queue_ops.h index 64ab19937929..bb70e0f85885 100644 --- a/caffe2/queue/queue_ops.h +++ b/caffe2/queue/queue_ops.h @@ -113,6 +113,12 @@ class SafeEnqueueBlobsOp final : public Operator { 1, !status, Output(size)->template mutable_data(), &context_); return true; } + + void Cancel() override { + auto queue = Operator::Inputs()[0] + ->template Get>(); + queue->close(); + } }; template @@ -192,6 +198,12 @@ class SafeDequeueBlobsOp final : public Operator { return true; } + void Cancel() override { + auto queue = Operator::Inputs()[0] + ->template Get>(); + queue->close(); + } + private: int numRecords_; std::vector blobs_; From 71e6ce66166bb74dbec0fffcdfc72b5fb0e6f9d5 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Thu, 24 Sep 2020 14:29:55 -0700 Subject: [PATCH 104/449] [JIT] Specialize AutogradZero: merge AutogradAnyNonZero and Not(AutogradAnyNonZero) checks into one. (#44987) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44987 This PR introduces new `prim::AutogradAllZero` and `prim::AutogradAllNonZero` ops that are used for a batch check for multiple tensors. The specialize-autogradzero pass now generates one check for all expected-to-be-undefined tensors, one check for all expected-to-be-defined tensors, and a bunch of checks for size parameters passed to `grad_sum_to_size` (this probably could be cleaned up somehow as well in future). 
An example of what we generated before this change: ``` %1626 : bool = prim::AutogradAnyNonZero(%0) %1627 : bool = prim::AutogradAnyNonZero(%2) %1628 : bool = aten::__not__(%1627) %1629 : bool = prim::AutogradAnyNonZero(%3) %1630 : bool = aten::__not__(%1629) %1631 : bool = prim::AutogradAnyNonZero(%4) %1632 : bool = aten::__not__(%1631) %1633 : bool = prim::AutogradAnyNonZero(%5) %1634 : bool = aten::__not__(%1633) %1635 : bool = prim::AutogradAnyNonZero(%6) %1636 : bool = aten::__not__(%1635) %1637 : bool = prim::AutogradAnyNonZero(%7) %1638 : bool = aten::__not__(%1637) %1639 : bool = prim::AutogradAnyNonZero(%8) %1640 : bool = aten::__not__(%1639) %1641 : bool = prim::AutogradAnyNonZero(%9) %1642 : bool = aten::__not__(%1641) %1643 : bool = prim::AutogradAnyNonZero(%10) %1644 : bool = aten::__not__(%1643) %1645 : bool = prim::AutogradAnyNonZero(%11) %1646 : bool = aten::__not__(%1645) %1647 : bool = prim::AutogradAnyNonZero(%12) %1648 : bool = aten::__not__(%1647) %1649 : bool = prim::AutogradAnyNonZero(%13) %1650 : bool = aten::__not__(%1649) %1651 : bool = prim::AutogradAnyNonZero(%14) %1652 : bool = aten::__not__(%1651) %1653 : bool = prim::AutogradAnyNonZero(%15) %1654 : bool = aten::__not__(%1653) %1655 : bool = prim::AutogradAnyNonZero(%16) %1656 : bool = aten::__not__(%1655) %1657 : bool = prim::AutogradAnyNonZero(%17) %1658 : bool = prim::AutogradAnyNonZero(%18) %1659 : bool = prim::AutogradAnyNonZero(%19) %1660 : bool = prim::AutogradAnyNonZero(%20) %1661 : bool = aten::__is__(%self_size.16, %1625) %1662 : bool = aten::__is__(%other_size.16, %1625) %1663 : bool = aten::__is__(%self_size.14, %1625) %1664 : bool = aten::__is__(%self_size.12, %1625) %1665 : bool = prim::AutogradAnyNonZero(%ingate.7) %1666 : bool = prim::AutogradAnyNonZero(%forgetgate.7) %1667 : bool = prim::AutogradAnyNonZero(%cellgate.7) %1668 : bool = prim::AutogradAnyNonZero(%30) %1669 : bool = prim::AutogradAnyNonZero(%31) %1670 : bool = aten::__is__(%self_size.10, %1625) %1671 : bool = aten::__is__(%other_size.10, %1625) %1672 : bool = prim::AutogradAnyNonZero(%34) %1673 : bool = prim::AutogradAnyNonZero(%35) %1674 : bool = aten::__is__(%self_size.8, %1625) %1675 : bool = aten::__is__(%other_size.8, %1625) %1676 : bool = aten::__is__(%self_size.6, %1625) %1677 : bool = aten::__is__(%other_size.6, %1625) %1678 : bool = prim::AutogradAnyNonZero(%outgate.7) %1679 : bool = prim::AutogradAnyNonZero(%41) %1680 : bool = prim::AutogradAnyNonZero(%42) %1681 : bool = prim::AutogradAnyNonZero(%43) %1682 : bool = aten::__is__(%self_size.4, %1625) %1683 : bool = aten::__is__(%other_size.4, %1625) %1684 : bool[] = prim::ListConstruct(%1626, %1628, %1630, %1632, %1634, %1636, %1638, %1640, %1642, %1644, %1646, %1648, %1650, %1652, %1654, %1656, %1657, %1658, %1659, %1660, %1661, %1662, %1663, %1664, %1665, %1666, %1667, %1668, %1669, %1670, %1671, %1672, %1673, %1674, %1675, %1676, %1677, %1678, %1679, %1680, %1681, %1682, %1683) %1685 : bool = aten::all(%1684) ``` Same example after this change: ``` %1625 : None = prim::Constant() %1626 : bool = aten::__is__(%self_size.16, %1625) %1627 : bool = aten::__is__(%other_size.16, %1625) %1628 : bool = aten::__is__(%self_size.14, %1625) %1629 : bool = aten::__is__(%self_size.12, %1625) %1630 : bool = aten::__is__(%self_size.10, %1625) %1631 : bool = aten::__is__(%other_size.10, %1625) %1632 : bool = aten::__is__(%self_size.8, %1625) %1633 : bool = aten::__is__(%other_size.8, %1625) %1634 : bool = aten::__is__(%self_size.6, %1625) %1635 : bool = aten::__is__(%other_size.6, %1625) 
%1636 : bool = aten::__is__(%self_size.4, %1625) %1637 : bool = aten::__is__(%other_size.4, %1625) %1638 : bool = prim::AutogradAllNonZero(%0, %17, %18, %19, %20, %ingate.7, %forgetgate.7, %cellgate.7, %30, %31, %34, %35, %outgate.7, %41, %42, %43) %1639 : bool = prim::AutogradAllZero(%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16) %1640 : bool[] = prim::ListConstruct(%1626, %1627, %1628, %1629, %1630, %1631, %1632, %1633, %1634, %1635, %1636, %1637, %1638, %1639) %1641 : bool = aten::all(%1640) ``` My performance measurements showed some changes, but I don't really trust them and think that they are probably just a noise. Below are tables with min-aggregation over 10 runs: FastRNN models: | name | base time (s) | diff time (s) | % change | | :--- | ---: | ---: | ---: | | lstm[aten]:bwd | 30.059927 | 29.834089 | -0.8% | | lstm[aten]:fwd | 25.673708 | 25.700039 | 0.1% | | lstm[cudnn]:bwd | 17.866232 | 17.893120 | 0.2% | | lstm[cudnn]:fwd | 11.418444 | 11.408514 | -0.1% | | lstm[jit]:bwd | 27.127205 | 27.141029 | 0.1% | | lstm[jit]:fwd | 17.018047 | 16.975451 | -0.3% | | lstm[jit_multilayer]:bwd | 27.502396 | 27.365149 | -0.5% | | lstm[jit_multilayer]:fwd | 16.918591 | 16.917767 | -0.0% | | lstm[jit_premul]:bwd | 22.281199 | 22.215082 | -0.3% | | lstm[jit_premul]:fwd | 14.848708 | 14.896231 | 0.3% | | lstm[jit_premul_bias]:bwd | 20.761206 | 21.170969 | 2.0% | | lstm[jit_premul_bias]:fwd | 15.013515 | 15.037978 | 0.2% | | lstm[jit_simple]:bwd | 26.715771 | 26.697786 | -0.1% | | lstm[jit_simple]:fwd | 16.675898 | 16.545893 | -0.8% | | lstm[py]:bwd | 56.327065 | 54.731030 | -2.8% | | lstm[py]:fwd | 39.876324 | 39.230572 | -1.6% | Torch Hub models: | name | base time (s) | diff time (s) | % change | | :--- | ---: | ---: | ---: | | test_eval[BERT_pytorch-cuda-jit] | 0.111706 | 0.106604 | -4.6% | | test_eval[LearningToPaint-cuda-jit] | 0.002841 | 0.002801 | -1.4% | | test_eval[Super_SloMo-cuda-jit] | 0.384869 | 0.384737 | -0.0% | | test_eval[attension_is_all_you_nee...-cuda-jit] | 0.123857 | 0.123923 | 0.1% | | test_eval[demucs-cuda-jit] | 0.077270 | 0.076878 | -0.5% | | test_eval[fastNLP-cuda-jit] | 0.000255 | 0.000249 | -2.3% | | test_eval[moco-cuda-jit] | 0.426472 | 0.427380 | 0.2% | | test_eval[pytorch_CycleGAN_and_pix...-cuda-jit] | 0.026483 | 0.026423 | -0.2% | | test_eval[pytorch_mobilenet_v3-cuda-jit] | 0.036202 | 0.035853 | -1.0% | | test_eval[pytorch_struct-cuda-jit] | 0.001439 | 0.001495 | 3.9% | | test_train[BERT_pytorch-cuda-jit] | 0.247236 | 0.247188 | -0.0% | | test_train[Background_Matting-cuda-jit] | 3.536659 | 3.581864 | 1.3% | | test_train[LearningToPaint-cuda-jit] | 0.015341 | 0.015331 | -0.1% | | test_train[Super_SloMo-cuda-jit] | 1.018626 | 1.019098 | 0.0% | | test_train[attension_is_all_you_nee...-cuda-jit] | 0.446314 | 0.444893 | -0.3% | | test_train[demucs-cuda-jit] | 0.169647 | 0.169846 | 0.1% | | test_train[fastNLP-cuda-jit] | 0.001990 | 0.001978 | -0.6% | | test_train[moco-cuda-jit] | 0.855323 | 0.856974 | 0.2% | | test_train[pytorch_mobilenet_v3-cuda-jit] | 0.497723 | 0.485416 | -2.5% | | test_train[pytorch_struct-cuda-jit] | 0.309692 | 0.308792 | -0.3% | Differential Revision: D23794659 Test Plan: Imported from OSS Reviewed By: bertmaher Pulled By: ZolotukhinM fbshipit-source-id: 859b68868ef839c5c6cbc7021879ee22d3144ea8 --- aten/src/ATen/core/interned_strings.h | 2 ++ .../jit/passes/specialize_autogradzero.cpp | 23 ++++++++++--- torch/csrc/jit/runtime/operator.cpp | 2 ++ torch/csrc/jit/runtime/profiling_record.cpp | 2 ++ 
.../jit/runtime/register_prim_ops_fulljit.cpp | 32 +++++++++++++++++++ 5 files changed, 57 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index bce5b27e37b1..b279a2400350 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -59,6 +59,8 @@ namespace c10 { _(prim, Store) \ _(prim, AutogradZero) \ _(prim, AutogradAnyNonZero) \ + _(prim, AutogradAllNonZero) \ + _(prim, AutogradAllZero) \ _(prim, Starred) \ _(prim, TupleConstruct) \ _(prim, TupleUnpack) \ diff --git a/torch/csrc/jit/passes/specialize_autogradzero.cpp b/torch/csrc/jit/passes/specialize_autogradzero.cpp index ad1fb36da5de..2fc95ae72339 100644 --- a/torch/csrc/jit/passes/specialize_autogradzero.cpp +++ b/torch/csrc/jit/passes/specialize_autogradzero.cpp @@ -117,6 +117,8 @@ struct AutogradZeroSpecializer { WithInsertPoint wip{graph_->block()->param_node()->next()}; Value* none_val = graph_->insertConstant(IValue()); std::vector checks; + std::vector zero_values; + std::vector nonzero_values; for (auto inp : graph_->inputs()) { if (auto profile_optional_node = getUse(inp, prim::profile_optional)) { @@ -146,15 +148,16 @@ struct AutogradZeroSpecializer { } state_[inp] = *pttp->undefined() ? State::Zero : State::Nonzero; - auto check = graph_->insert(prim::AutogradAnyNonZero, {inp}); + if (*pttp->undefined()) { - check = graph_->insert(aten::__not__, {check}); + zero_values.push_back(inp); + } else { + nonzero_values.push_back(inp); } - checks.push_back(check); } // unable to specialize any of the inputs - if (checks.size() == 0) { + if (nonzero_values.size() == 0 && zero_values.size() == 0) { GRAPH_DUMP("Unable to add any specialization guards", graph_); versioning_if->destroy(); // the checks we inserted will be cleaned up @@ -162,6 +165,18 @@ struct AutogradZeroSpecializer { return nullptr; } + Node* nonzero_check = graph_->insert(prim::AutogradAllNonZero, {})->node(); + for (Value* v : nonzero_values) { + nonzero_check->addInput(v); + } + checks.push_back(nonzero_check->output()); + + Node* zero_check = graph_->insert(prim::AutogradAllZero, {})->node(); + for (Value* v : zero_values) { + zero_check->addInput(v); + } + checks.push_back(zero_check->output()); + Value* bool_list = graph_->insertNode(graph_->createList(BoolType::get(), checks)) ->output(); diff --git a/torch/csrc/jit/runtime/operator.cpp b/torch/csrc/jit/runtime/operator.cpp index f12c8186396e..2bd6a2b47ec9 100644 --- a/torch/csrc/jit/runtime/operator.cpp +++ b/torch/csrc/jit/runtime/operator.cpp @@ -224,6 +224,8 @@ bool printerHasSpecialCaseFor(Symbol sym) { c10::onnx::Shape, // only used in onnx prim::AutogradZero, // temporarily inserted by autograd prim::AutogradAnyNonZero, // temporarily inserted by autograd + prim::AutogradAllNonZero, // temporarily inserted by autograd + prim::AutogradAllZero, // temporarily inserted by autograd prim::AutogradAdd, // temporarily inserted by autograd prim::ConstantChunk, // optimization pass adds it prim::DifferentiableGraph, // optimization pass adds it, diff --git a/torch/csrc/jit/runtime/profiling_record.cpp b/torch/csrc/jit/runtime/profiling_record.cpp index 6ad16774789b..98c073668170 100644 --- a/torch/csrc/jit/runtime/profiling_record.cpp +++ b/torch/csrc/jit/runtime/profiling_record.cpp @@ -171,6 +171,8 @@ bool needsProfiledInputs(Node* n) { // specialize_autogradzero case prim::AutogradAdd: case prim::AutogradAnyNonZero: + case prim::AutogradAllNonZero: + case prim::AutogradAllZero: case 
prim::AutogradZero: // peephole case aten::dim: diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp index e3b0fa1e88c3..dc075ce14166 100644 --- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp @@ -442,6 +442,38 @@ RegisterOperators reg( stack->emplace_back(result); }, aliasAnalysisFromSchema()), + Operator( + "prim::AutogradAllZero(...) -> bool", + [](Stack* stack) { + auto num_inputs = pop(stack).toInt(); + bool result = true; + for (const IValue& v : last(stack, num_inputs)) { + TORCH_INTERNAL_ASSERT(v.isTensor()); + if (v.toTensor().defined()) { + result = false; + break; + } + } + drop(stack, num_inputs); + stack->emplace_back(result); + }, + aliasAnalysisFromSchema()), + Operator( + "prim::AutogradAllNonZero(...) -> bool", + [](Stack* stack) { + auto num_inputs = pop(stack).toInt(); + bool result = true; + for (const IValue& v : last(stack, num_inputs)) { + TORCH_INTERNAL_ASSERT(v.isTensor()); + if (!v.toTensor().defined()) { + result = false; + break; + } + } + drop(stack, num_inputs); + stack->emplace_back(result); + }, + aliasAnalysisFromSchema()), Operator( "prim::AutogradAdd(Any a, Any b) -> Any", [](Stack* stack) { From cd7a68228280e8497ebb55402c1fd7ce0c905b92 Mon Sep 17 00:00:00 2001 From: Danny Huang Date: Thu, 24 Sep 2020 14:39:58 -0700 Subject: [PATCH 105/449] [caffe2] adds hypothesis test for queue ops cancel (#45178) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45178 ## Motivation * To be able to make C2 ops cancellable so we can safely exit. * Some C2 operators are now blocking thus being non-cancellable. If an error occurs we need to be able to safely stop all net execution so we can throw the exception to the caller. ## Summary * Adds a hypothesis test for queue ops cancellation. Test Plan: ## Unit test added to verify that queue ops propagate errors ``` buck test caffe2/caffe2/python:hypothesis_test buck test caffe2/caffe2/python:hypothesis_test -- test_safe_dequeue_blob__raises_exception_when_hang --stress-runs 1000 ``` ``` Summary Pass: 1000 ListingSuccess: 1 ``` Reviewed By: d4l3k Differential Revision: D23847576 fbshipit-source-id: 2fc351e1ee13ea8b32d976216d2d01dfb6fcc1ad --- caffe2/python/hypothesis_test.py | 56 +++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 8a286383f60f..045677f8422a 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -10,7 +10,7 @@ from hypothesis import assume, given, settings, HealthCheck import hypothesis.strategies as st import unittest -import os +import threading from caffe2.python import core, workspace, tt_core, dyndep import caffe2.python.hypothesis_test_util as hu @@ -2695,6 +2695,60 @@ def histogram(X): self.assertDeviceChecks(dc, op, [X], [0, 1]) self.assertReferenceChecks(gc, op, [X], histogram) + @settings(max_examples=1, deadline=None) + @given( + queue_capacity=st.integers(2, 2), + time_sleep=st.integers(5, 10), + num_blobs_to_equeue=st.integers(1, 1), + num_blobs_to_dequeue=st.integers(2, 2), + ) + def test_safe_dequeue_blob__raises_exception_when_hang( + self, + queue_capacity, + time_sleep, + num_blobs_to_equeue, + num_blobs_to_dequeue, + ): + r""" + Tests SafeDequeueBlobsOp being cancellable. 
+ + Create a queue with the number of BlobsQueue less than the number + SafeDequeueBlobs to cause the hanging behavior when running the Net. + + Then call cancel from the previous sleeping thread to ensure exception + is raised. + """ + + def _net_instance_cancel(net_instance): + time.sleep(time_sleep) + net_instance.cancel() + + init_net = core.Net("init_net") + init_net.Proto().type = "async_scheduling" + + queue = init_net.CreateBlobsQueue( + [], + "queue_name", + capacity=queue_capacity, + num_blobs=num_blobs_to_equeue, + ) + + ws = workspace.Workspace() + ws.create_net(init_net).run() + + net = core.Net("net") + net.Proto().type = "async_scheduling" + + blobs = net.SafeDequeueBlobs([queue], num_blobs_to_dequeue) + + net_instance = ws.create_net(net) + + t = threading.Thread(target=_net_instance_cancel, args=[net_instance]) + t.start() + + with self.assertRaises(Exception): + net_instance.run() + t.join() if __name__ == "__main__": From b84dd771e69c67162716ddd1baa4d9b062531c82 Mon Sep 17 00:00:00 2001 From: vishalrao487 <111801046@smail.iitpkd.ac.in> Date: Thu, 24 Sep 2020 14:40:53 -0700 Subject: [PATCH 106/449] Grammatically updated the tech docs (#45192) Summary: Small grammatical update to the [https://pytorch.org/docs/stable/tensors.html](url) docs. **_update1_** ![update1](https://user-images.githubusercontent.com/62737243/93969792-5c0ea800-fd8a-11ea-8c9f-0033f51a1fdc.png) **_update2_** ![update2](https://user-images.githubusercontent.com/62737243/93969801-603ac580-fd8a-11ea-812d-d3026b9fc8a5.png) Pull Request resolved: https://github.com/pytorch/pytorch/pull/45192 Reviewed By: bwasti Differential Revision: D23877870 Pulled By: ezyang fbshipit-source-id: 929ba3d479925b5132dbe87fad2da487408db7c7 --- docs/source/tensors.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index cd1c363604fe..7cd1a88f82b3 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -8,7 +8,7 @@ torch.Tensor A :class:`torch.Tensor` is a multi-dimensional matrix containing elements of a single data type. -Torch defines 10 tensor types with CPU and GPU variants: +Torch defines 10 tensor types with CPU and GPU variants which are as follows: ========================== =========================================== ============================= ================================ Data type dtype CPU tensor GPU tensor @@ -32,7 +32,7 @@ Boolean ``torch.bool`` :class: Sometimes referred to as binary16: uses 1 sign, 5 exponent, and 10 significand bits. Useful when precision is important at the expense of range. .. [2] - Sometimes referred to as Brain Floating Point: use 1 sign, 8 exponent and 7 + Sometimes referred to as Brain Floating Point: uses 1 sign, 8 exponent, and 7 significand bits. Useful when range is important, since it has the same number of exponent bits as ``float32`` From 6311c5a483e446abaca2d95c2d58b5f462911e6f Mon Sep 17 00:00:00 2001 From: Ashkan Aliabadi Date: Thu, 24 Sep 2020 15:04:51 -0700 Subject: [PATCH 107/449] Minor touchups. 
(#44317) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44317 Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D23820828 Pulled By: AshkanAliabadi fbshipit-source-id: b83bdea9aed2fb52bd254ff15914d55a1af58c04 --- aten/src/ATen/native/vulkan/Vulkan.h | 2 +- aten/src/ATen/native/vulkan/api/Allocator.h | 12 +++- aten/src/ATen/native/vulkan/api/Command.cpp | 62 ++++++++++++++++--- aten/src/ATen/native/vulkan/api/Command.h | 2 +- aten/src/ATen/native/vulkan/api/Common.h | 22 +++++-- aten/src/ATen/native/vulkan/api/Context.cpp | 28 +++++++-- aten/src/ATen/native/vulkan/api/Context.h | 6 +- .../src/ATen/native/vulkan/api/Descriptor.cpp | 39 +++++++++++- aten/src/ATen/native/vulkan/api/Descriptor.h | 10 +-- aten/src/ATen/native/vulkan/api/Pipeline.cpp | 59 ++++++++++++++++-- aten/src/ATen/native/vulkan/api/Pipeline.h | 2 +- aten/src/ATen/native/vulkan/api/Resource.cpp | 42 ++++++++++++- aten/src/ATen/native/vulkan/api/Resource.h | 23 +++++-- aten/src/ATen/native/vulkan/api/Shader.cpp | 29 ++++++++- aten/src/ATen/native/vulkan/api/Shader.h | 12 ++-- 15 files changed, 294 insertions(+), 56 deletions(-) diff --git a/aten/src/ATen/native/vulkan/Vulkan.h b/aten/src/ATen/native/vulkan/Vulkan.h index df9a53f7076d..c2b1775e8f0a 100644 --- a/aten/src/ATen/native/vulkan/Vulkan.h +++ b/aten/src/ATen/native/vulkan/Vulkan.h @@ -456,7 +456,7 @@ class ComputeUnit final { void createComputePipelineCompile( const std::string& glslSrc, const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout& descrSetLayout, + const VkDescriptorSetLayout descrSetLayout, const WorkGroupSize workGroupSize); #endif diff --git a/aten/src/ATen/native/vulkan/api/Allocator.h b/aten/src/ATen/native/vulkan/api/Allocator.h index afa720a515e6..f0f0c9baa59c 100644 --- a/aten/src/ATen/native/vulkan/api/Allocator.h +++ b/aten/src/ATen/native/vulkan/api/Allocator.h @@ -2,11 +2,19 @@ #include +#ifdef DEBUG + #define VMA_DEBUG_LOG(format, ...) \ + do { \ + printf(format, ##__VA_ARGS__); \ + printf("\n"); \ + } while(false) +#endif /* DEBUG */ + #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wnullability-completeness" #pragma clang diagnostic ignored "-Wunused-variable" -#endif +#endif /* __clang__ */ // Do NOT include vk_mem_alloc.h directly. // Always include this file (Allocator.h) instead. 
@@ -15,4 +23,4 @@ #ifdef __clang__ #pragma clang diagnostic pop -#endif +#endif /* __clang__ */ diff --git a/aten/src/ATen/native/vulkan/api/Command.cpp b/aten/src/ATen/native/vulkan/api/Command.cpp index 21279b408233..48512215c5fc 100644 --- a/aten/src/ATen/native/vulkan/api/Command.cpp +++ b/aten/src/ATen/native/vulkan/api/Command.cpp @@ -7,6 +7,9 @@ namespace api { Command::Pool::Factory::Factory(const VkDevice device) : device_(device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( @@ -20,7 +23,14 @@ typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( VkCommandPool command_pool{}; VK_CHECK(vkCreateCommandPool( - device_, &command_pool_create_info, nullptr, &command_pool)); + device_, + &command_pool_create_info, + nullptr, + &command_pool)); + + TORCH_CHECK( + command_pool, + "Invalid Vulkan command pool!"); return Handle{ command_pool, @@ -31,8 +41,13 @@ typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( void Command::Pool::purge( const VkDevice device, const VkCommandPool command_pool) { - TORCH_INTERNAL_ASSERT(device, "Invalid Vulkan device!"); - TORCH_INTERNAL_ASSERT(command_pool, "Invalid Vulkan command pool!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_pool, + "Invalid Vulkan command pool!"); VK_CHECK(vkResetCommandPool(device, command_pool, 0u)); } @@ -42,6 +57,14 @@ namespace { VkCommandBuffer allocate_command_buffer( const VkDevice device, const VkCommandPool command_pool) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_pool, + "Invalid Vulkan command pool!"); + const VkCommandBufferAllocateInfo command_buffer_allocate_info{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, nullptr, @@ -52,7 +75,13 @@ VkCommandBuffer allocate_command_buffer( VkCommandBuffer command_buffer{}; VK_CHECK(vkAllocateCommandBuffers( - device, &command_buffer_allocate_info, &command_buffer)); + device, + &command_buffer_allocate_info, + &command_buffer)); + + TORCH_CHECK( + command_buffer, + "Invalid Vulkan command buffer!"); return command_buffer; } @@ -61,6 +90,9 @@ VkCommandBuffer allocate_command_buffer( Command::Buffer::Buffer(const VkDevice device, const VkCommandPool command_pool) : command_buffer_(allocate_command_buffer(device, command_pool)) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_buffer_, + "Invalid Vulkan command buffer!"); } void Command::Buffer::Buffer::begin() { @@ -71,7 +103,9 @@ void Command::Buffer::Buffer::begin() { nullptr, }; - VK_CHECK(vkBeginCommandBuffer(command_buffer_, &command_buffer_begin_info)); + VK_CHECK(vkBeginCommandBuffer( + command_buffer_, + &command_buffer_begin_info)); } void Command::Buffer::Buffer::end() { @@ -79,16 +113,26 @@ void Command::Buffer::Buffer::end() { } void Command::Buffer::bind(const VkPipeline pipeline) { - TORCH_INTERNAL_ASSERT(pipeline, "Invalid Vulkan pipeline!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + pipeline, + "Invalid Vulkan pipeline!"); - vkCmdBindPipeline(command_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + vkCmdBindPipeline( + command_buffer_, + VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); } void Command::Buffer::bind( const VkPipelineLayout pipeline_layout, const VkDescriptorSet descriptor_set) { - TORCH_INTERNAL_ASSERT(pipeline_layout, "Invalid Vulkan pipeline layout!"); - TORCH_INTERNAL_ASSERT(descriptor_set, 
"Invalid Vulkan descriptor set!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + pipeline_layout, + "Invalid Vulkan pipeline layout!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_set, + "Invalid Vulkan descriptor set!"); vkCmdBindDescriptorSets( command_buffer_, diff --git a/aten/src/ATen/native/vulkan/api/Command.h b/aten/src/ATen/native/vulkan/api/Command.h index 462a50fef7fd..554e6fdf373e 100644 --- a/aten/src/ATen/native/vulkan/api/Command.h +++ b/aten/src/ATen/native/vulkan/api/Command.h @@ -9,7 +9,7 @@ namespace native { namespace vulkan { namespace api { -struct C10_EXPORT Command final { +struct Command final { // // Pool // diff --git a/aten/src/ATen/native/vulkan/api/Common.h b/aten/src/ATen/native/vulkan/api/Common.h index 0c1e7cc4720b..aec26bf987a0 100644 --- a/aten/src/ATen/native/vulkan/api/Common.h +++ b/aten/src/ATen/native/vulkan/api/Common.h @@ -24,10 +24,10 @@ at::native::vulkan::api::destroy_##Handle #define VK_DELETER_DISPATCHABLE_DECLARE(Handle) \ - C10_EXPORT void destroy_##Handle(const Vk##Handle handle) + void destroy_##Handle(const Vk##Handle handle) #define VK_DELETER_NON_DISPATCHABLE_DECLARE(Handle) \ - class C10_EXPORT destroy_##Handle final { \ + class destroy_##Handle final { \ public: \ explicit destroy_##Handle(const VkDevice device); \ void operator()(const Vk##Handle handle) const; \ @@ -40,6 +40,14 @@ namespace native { namespace vulkan { namespace api { +struct Command; +class Context; +struct Descriptor; +struct Pipeline; +struct Resource; +class Runtime; +struct Shader; + VK_DELETER_DISPATCHABLE_DECLARE(Instance); VK_DELETER_DISPATCHABLE_DECLARE(Device); VK_DELETER_NON_DISPATCHABLE_DECLARE(Semaphore); @@ -78,11 +86,13 @@ class Handle final { Handle(const Handle&) = delete; Handle& operator=(const Handle&) = delete; Handle(Handle&&); - Handle& operator=(Handle&&); + Handle& operator=(Handle&&) &; + Handle& operator=(Handle&&) && = delete; ~Handle(); operator bool() const; - Type get() const; + Type get() const &; + Type get() const && = delete; Type release(); void reset(Type payload = kNull); @@ -112,7 +122,7 @@ inline Handle::Handle(Handle&& handle) template inline Handle& -Handle::operator=(Handle&& handle) +Handle::operator=(Handle&& handle) & { reset(handle.release()); deleter_ = std::move(handle.deleter_); @@ -130,7 +140,7 @@ inline Handle::operator bool() const { } template -inline Type Handle::get() const { +inline Type Handle::get() const & { return payload_; } diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp index 76a245e16d38..206967b550b2 100644 --- a/aten/src/ATen/native/vulkan/api/Context.cpp +++ b/aten/src/ATen/native/vulkan/api/Context.cpp @@ -82,7 +82,9 @@ VkInstance create_instance(const bool enable_validation_layers) { instance_extension_count); VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &instance_extension_count, instance_extension_properties.data())); + nullptr, + &instance_extension_count, + instance_extension_properties.data())); constexpr const char* const requested_instance_extensions[]{ VK_EXT_DEBUG_REPORT_EXTENSION_NAME, @@ -121,6 +123,7 @@ VkInstance create_instance(const bool enable_validation_layers) { VkInstance instance{}; VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); + TORCH_CHECK(instance, "Invalid Vulkan instance!"); return instance; } @@ -159,13 +162,20 @@ VkDebugReportCallbackEXT create_debug_report_callback( nullptr, &debug_report_callback)); + TORCH_CHECK( + debug_report_callback, + "Invalid Vulkan debug report 
callback!"); + return debug_report_callback; } VkPhysicalDevice acquire_physical_device(const VkInstance instance) { uint32_t device_count = 0; VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); - TORCH_CHECK(device_count > 0, "Vulkan: Could not find a device with Vulkan support!"); + + TORCH_CHECK( + device_count > 0, + "Vulkan: Could not find a device with Vulkan support!"); std::vector devices(device_count); VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); @@ -187,13 +197,16 @@ uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device physical_device, &queue_family_count, nullptr); TORCH_CHECK( - queue_family_count > 0, "Vulkan: Invalid number of queue families!"); + queue_family_count > 0, + "Vulkan: Invalid number of queue families!"); std::vector queue_families_properties( queue_family_count); vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, &queue_family_count, queue_families_properties.data()); + physical_device, + &queue_family_count, + queue_families_properties.data()); for (uint32_t i = 0; i < queue_families_properties.size(); ++i) { const VkQueueFamilyProperties& properties = queue_families_properties[i]; @@ -234,6 +247,7 @@ VkDevice create_device( VkDevice device{}; VK_CHECK(vkCreateDevice(physical_device, &device_create_info, nullptr, &device)); + TORCH_CHECK(device, "Invalid Vulkan device!"); return device; } @@ -243,6 +257,8 @@ VkQueue acquire_queue( const uint32_t compute_queue_family_index) { VkQueue queue{}; vkGetDeviceQueue(device, compute_queue_family_index, 0, &queue); + TORCH_CHECK(queue, "Invalid Vulkan queue!"); + return queue; } @@ -309,11 +325,11 @@ bool available() { return initialize(); } -Context& context() { +Context* context() { Context* const context = initialize(); TORCH_CHECK(context, "Vulkan: Backend not available on this platform!"); - return *context; + return context; } } // namespace api diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h index d57eab66108e..7cec6ada5d5e 100644 --- a/aten/src/ATen/native/vulkan/api/Context.h +++ b/aten/src/ATen/native/vulkan/api/Context.h @@ -19,7 +19,7 @@ namespace api { // user. 
// -class C10_EXPORT Context final { +class Context final { public: explicit Context(bool enable_validation_layers); ~Context() = default; @@ -90,8 +90,8 @@ class C10_EXPORT Context final { Resource resource_; }; -C10_EXPORT bool available(); -C10_EXPORT Context& context(); +bool available(); +Context* context(); } // namespace api } // namespace vulkan diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.cpp b/aten/src/ATen/native/vulkan/api/Descriptor.cpp index 1b5ea94341a3..bab10466ea02 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.cpp +++ b/aten/src/ATen/native/vulkan/api/Descriptor.cpp @@ -46,6 +46,9 @@ const Descriptor::Pool::Descriptor Descriptor::Pool::kDefault{ Descriptor::Pool::Factory::Factory(const VkDevice device) : device_(device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); } typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator()( @@ -61,7 +64,14 @@ typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator() VkDescriptorPool descriptor_pool{}; VK_CHECK(vkCreateDescriptorPool( - device_, &descriptor_pool_create_info, nullptr, &descriptor_pool)); + device_, + &descriptor_pool_create_info, + nullptr, + &descriptor_pool)); + + TORCH_CHECK( + descriptor_pool, + "Invalid Vulkan descriptor pool!"); return Handle{ descriptor_pool, @@ -72,12 +82,29 @@ typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator() void Descriptor::Pool::purge( const VkDevice device, const VkDescriptorPool descriptor_pool) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_pool, + "Invalid Vulkan descriptor pool!"); + VK_CHECK(vkResetDescriptorPool(device, descriptor_pool, 0u)); } -Descriptor::Factory::Factory(const VkDevice device, const VkDescriptorPool descriptor_pool) +Descriptor::Factory::Factory( + const VkDevice device, + const VkDescriptorPool descriptor_pool) : device_(device), descriptor_pool_(descriptor_pool) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_pool, + "Invalid Vulkan descriptor pool!"); } VkDescriptorSet Descriptor::Factory::allocate( @@ -92,7 +119,13 @@ VkDescriptorSet Descriptor::Factory::allocate( VkDescriptorSet descriptor_set{}; VK_CHECK(vkAllocateDescriptorSets( - device_, &descriptor_set_allocate_info, &descriptor_set)); + device_, + &descriptor_set_allocate_info, + &descriptor_set)); + + TORCH_CHECK( + descriptor_set, + "Invalid Vulkan descriptor set!"); return descriptor_set; } diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.h b/aten/src/ATen/native/vulkan/api/Descriptor.h index 3e339ae4641f..da4a2a03e2f9 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.h +++ b/aten/src/ATen/native/vulkan/api/Descriptor.h @@ -49,7 +49,7 @@ namespace api { // as well. This behavior is by design. 
// -struct C10_EXPORT Descriptor final { +struct Descriptor final { // // Pool // @@ -156,8 +156,8 @@ inline size_t Descriptor::Pool::Factory::Hasher::operator()( } // namespace at inline bool operator==( - const VkDescriptorPoolSize& descriptor_pool_size_1, - const VkDescriptorPoolSize& descriptor_pool_size_2) { - return (descriptor_pool_size_1.type == descriptor_pool_size_2.type) && - (descriptor_pool_size_1.descriptorCount == descriptor_pool_size_2.descriptorCount); + const VkDescriptorPoolSize& _1, + const VkDescriptorPoolSize& _2) { + return (_1.type == _2.type) && + (_1.descriptorCount == _2.descriptorCount); } diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.cpp b/aten/src/ATen/native/vulkan/api/Pipeline.cpp index 303eea7cb401..3c845c5fae32 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.cpp +++ b/aten/src/ATen/native/vulkan/api/Pipeline.cpp @@ -7,10 +7,17 @@ namespace api { Pipeline::Layout::Factory::Factory(const VkDevice device) : device_(device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } typename Pipeline::Layout::Factory::Handle Pipeline::Layout::Factory::operator()( const Descriptor& descriptor) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor.descriptor_set_layout, + "Invalid Vulkan descriptor set layout!"); + const VkPipelineLayoutCreateInfo pipeline_layout_create_info{ VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, nullptr, @@ -23,7 +30,14 @@ typename Pipeline::Layout::Factory::Handle Pipeline::Layout::Factory::operator() VkPipelineLayout pipeline_layout{}; VK_CHECK(vkCreatePipelineLayout( - device_, &pipeline_layout_create_info, nullptr, &pipeline_layout)); + device_, + &pipeline_layout_create_info, + nullptr, + &pipeline_layout)); + + TORCH_CHECK( + pipeline_layout, + "Invalid Vulkan pipeline layout!"); return Handle{ pipeline_layout, @@ -34,6 +48,10 @@ typename Pipeline::Layout::Factory::Handle Pipeline::Layout::Factory::operator() namespace { VkPipelineCache create_pipeline_cache(const VkDevice device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + const VkPipelineCacheCreateInfo pipeline_cache_create_info{ VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, nullptr, @@ -44,7 +62,14 @@ VkPipelineCache create_pipeline_cache(const VkDevice device) { VkPipelineCache pipeline_cache{}; VK_CHECK(vkCreatePipelineCache( - device, &pipeline_cache_create_info, nullptr, &pipeline_cache)); + device, + &pipeline_cache_create_info, + nullptr, + &pipeline_cache)); + + TORCH_CHECK( + pipeline_cache, + "Invalid Vulkan pipeline cache!"); return pipeline_cache; } @@ -53,11 +78,28 @@ VkPipelineCache create_pipeline_cache(const VkDevice device) { Pipeline::Factory::Factory(const VkDevice device) : device_(device), - pipeline_cache_(create_pipeline_cache(device), VK_DELETER(PipelineCache)(device)) { + pipeline_cache_( + create_pipeline_cache(device), + VK_DELETER(PipelineCache)(device)) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + pipeline_cache_, + "Invalid Vulkan pipeline cache!"); } typename Pipeline::Factory::Handle Pipeline::Factory::operator()( const Descriptor& descriptor) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor.pipeline_layout, + "Invalid Vulkan pipeline layout!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor.shader_module, + "Invalid Vulkan shader module!"); + constexpr uint32_t x_offset = 0u; constexpr uint32_t x_size = sizeof(Shader::WorkGroup::x); constexpr uint32_t y_offset = x_offset + x_size; @@ -113,7 
+155,16 @@ typename Pipeline::Factory::Handle Pipeline::Factory::operator()( VkPipeline pipeline{}; VK_CHECK(vkCreateComputePipelines( - device_, pipeline_cache_.get(), 1u, &compute_pipeline_create_info, nullptr, &pipeline)); + device_, + pipeline_cache_.get(), + 1u, + &compute_pipeline_create_info, + nullptr, + &pipeline)); + + TORCH_CHECK( + pipeline, + "Invalid Vulkan pipeline!"); return Handle{ pipeline, diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.h b/aten/src/ATen/native/vulkan/api/Pipeline.h index a5d72324c36e..0ecef40c8b19 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.h +++ b/aten/src/ATen/native/vulkan/api/Pipeline.h @@ -29,7 +29,7 @@ namespace api { // these Vulkan objects. // -struct C10_EXPORT Pipeline final { +struct Pipeline final { // // Layout // diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp index c538a1b6e2d0..7163294bd1d9 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.cpp +++ b/aten/src/ATen/native/vulkan/api/Resource.cpp @@ -10,6 +10,18 @@ VmaAllocator create_allocator( const VkInstance instance, const VkPhysicalDevice physical_device, const VkDevice device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + instance, + "Invalid Vulkan instance!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + const VmaAllocatorCreateInfo allocator_create_info{ 0u, physical_device, @@ -27,6 +39,7 @@ VmaAllocator create_allocator( VmaAllocator allocator{}; VK_CHECK(vmaCreateAllocator(&allocator_create_info, &allocator)); + TORCH_CHECK(allocator, "Invalid VMA allocator!"); return allocator; } @@ -87,6 +100,13 @@ Resource::Memory::Scope::Scope( : allocator_(allocator), allocation_(allocation), access_(access) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + allocator, + "Invalid VMA allocator!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + allocation, + "Invalid VMA allocation!"); } void Resource::Memory::Scope::operator()(const void* const data) const { @@ -109,7 +129,12 @@ Resource::Pool::Pool( const VkPhysicalDevice physical_device, const VkDevice device) : device_(device), - allocator_(create_allocator(instance, physical_device, device), vmaDestroyAllocator) { + allocator_( + create_allocator( + instance, + physical_device, + device), + vmaDestroyAllocator) { buffers_.reserve(Configuration::kReserve); images_.reserve(Configuration::kReserve); } @@ -141,6 +166,9 @@ Resource::Buffer Resource::Pool::allocate(const Buffer::Descriptor& descriptor) &allocation, &allocation_info)); + TORCH_CHECK(buffer, "Invalid Vulkan buffer!"); + TORCH_CHECK(allocation, "Invalid VMA allocation!"); + buffers_.emplace_back( Buffer{ buffer, @@ -189,6 +217,9 @@ Resource::Image Resource::Pool::allocate(const Image::Descriptor& descriptor) { &allocation, &allocation_info)); + TORCH_CHECK(image, "Invalid Vulkan image!"); + TORCH_CHECK(allocation, "Invalid VMA allocation!"); + const VkImageViewCreateInfo image_view_create_info{ VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, nullptr, @@ -213,7 +244,14 @@ Resource::Image Resource::Pool::allocate(const Image::Descriptor& descriptor) { VkImageView view{}; VK_CHECK(vkCreateImageView( - device_, &image_view_create_info, nullptr, &view)) + device_, + &image_view_create_info, + nullptr, + &view)); + + TORCH_CHECK( + view, + "Invalid Vulkan image view!"); images_.emplace_back( Image{ diff --git a/aten/src/ATen/native/vulkan/api/Resource.h b/aten/src/ATen/native/vulkan/api/Resource.h index 
04cd9a067663..a74a3c2d3c89 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.h +++ b/aten/src/ATen/native/vulkan/api/Resource.h @@ -8,7 +8,7 @@ namespace native { namespace vulkan { namespace api { -struct C10_EXPORT Resource final { +struct Resource final { /* Memory */ @@ -25,12 +25,25 @@ struct C10_EXPORT Resource final { template< typename Type, typename Pointer = std::add_pointer_t>> - Data map() const; + Data map() const &; template< typename Type, typename Pointer = std::add_pointer_t> - Data map(); + Data map() &; + + private: + // Intentionally disabed to ensure memory access is always properly + // encapsualted in a scoped map-unmap region. Allowing below overloads + // to be invoked on a temporary would open the door to the possibility + // of accessing the underlying memory out of the expected scope making + // for seemingly ineffective memory writes and hard to hunt down bugs. + + template + Data map() const && = delete; + + template + Data map() && = delete; }; /* @@ -144,7 +157,7 @@ class Resource::Memory::Scope final { }; template -inline Resource::Memory::Data Resource::Memory::map() const { +inline Resource::Memory::Data Resource::Memory::map() const & { void* map(const Memory& memory); return Data{ @@ -154,7 +167,7 @@ inline Resource::Memory::Data Resource::Memory::map() const { } template -inline Resource::Memory::Data Resource::Memory::map() { +inline Resource::Memory::Data Resource::Memory::map() & { void* map(const Memory& memory); return Data{ diff --git a/aten/src/ATen/native/vulkan/api/Shader.cpp b/aten/src/ATen/native/vulkan/api/Shader.cpp index bbd3e3464d78..4cde24a2eef9 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.cpp +++ b/aten/src/ATen/native/vulkan/api/Shader.cpp @@ -11,6 +11,9 @@ namespace api { Shader::Layout::Factory::Factory(const VkDevice device) : device_(device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( @@ -25,7 +28,14 @@ Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( VkDescriptorSetLayout descriptor_set_layout{}; VK_CHECK(vkCreateDescriptorSetLayout( - device_, &descriptor_set_layout_create_info, nullptr, &descriptor_set_layout)); + device_, + &descriptor_set_layout_create_info, + nullptr, + &descriptor_set_layout)); + + TORCH_CHECK( + descriptor_set_layout, + "Invalid Vulkan descriptor set layout!"); return Handle{ descriptor_set_layout, @@ -35,6 +45,8 @@ Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( Shader::Descriptor::Descriptor(const char* const glsl) : type(Type::Source) { + TORCH_CHECK(glsl, "Invalid shader source code!"); + shader.source = { glsl, 0u, @@ -43,6 +55,8 @@ Shader::Descriptor::Descriptor(const char* const glsl) Shader::Descriptor::Descriptor(const uint32_t* const code, const uint32_t size) : type(Type::Binary) { + TORCH_CHECK(code && (0u != size), "Invalid shader binary!"); + shader.binary = { code, size, @@ -68,6 +82,10 @@ struct Shader::Factory::Compiler final { } std::vector compile(const char* const source) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + source, + "Invalid shader source code!"); + const shaderc::SpvCompilationResult result = context.CompileGlslToSpv( source, ::strlen(source), @@ -139,7 +157,14 @@ typename Shader::Factory::Handle Shader::Factory::operator()( VkShaderModule shader_module{}; VK_CHECK(vkCreateShaderModule( - device_, &shader_module_create_info, nullptr, &shader_module)); + device_, + &shader_module_create_info, + nullptr, + 
&shader_module)); + + TORCH_CHECK( + shader_module, + "Invalid Vulkan shader module!"); return Handle{ shader_module, diff --git a/aten/src/ATen/native/vulkan/api/Shader.h b/aten/src/ATen/native/vulkan/api/Shader.h index 0fd2fa01614b..4a0080cb888d 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.h +++ b/aten/src/ATen/native/vulkan/api/Shader.h @@ -32,7 +32,7 @@ namespace api { // and destruct the aforementioned Vulkan objects. // -struct C10_EXPORT Shader final { +struct Shader final { // // Layout // @@ -187,11 +187,11 @@ inline size_t Shader::Layout::Factory::Hasher::operator()( } inline bool operator==( - const Shader::WorkGroup& work_group_1, - const Shader::WorkGroup& work_group_2) { - return (work_group_1.x == work_group_2.x) && - (work_group_1.y == work_group_2.y) && - (work_group_1.z == work_group_2.z); + const Shader::WorkGroup& _1, + const Shader::WorkGroup& _2) { + return (_1.x == _2.x) && + (_1.y == _2.y) && + (_1.z == _2.z); } inline bool operator==( From 5a59330647d1c461dc49dc84b5ff5f18e8a192d8 Mon Sep 17 00:00:00 2001 From: Ashkan Aliabadi Date: Thu, 24 Sep 2020 15:04:51 -0700 Subject: [PATCH 108/449] Add architectural support for multi-GPU. (#44059) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44059 Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D23820825 Pulled By: AshkanAliabadi fbshipit-source-id: 0719b00581487a77ebadff867d1e4ac89354bf90 --- aten/src/ATen/native/vulkan/api/Adapter.h | 36 ++ aten/src/ATen/native/vulkan/api/Command.cpp | 6 +- aten/src/ATen/native/vulkan/api/Command.h | 10 +- aten/src/ATen/native/vulkan/api/Common.h | 7 + aten/src/ATen/native/vulkan/api/Context.cpp | 300 ++------------- aten/src/ATen/native/vulkan/api/Context.h | 68 ++-- .../src/ATen/native/vulkan/api/Descriptor.cpp | 6 +- aten/src/ATen/native/vulkan/api/Descriptor.h | 12 +- aten/src/ATen/native/vulkan/api/Pipeline.cpp | 12 +- aten/src/ATen/native/vulkan/api/Pipeline.h | 14 +- aten/src/ATen/native/vulkan/api/Resource.cpp | 22 +- aten/src/ATen/native/vulkan/api/Resource.h | 12 +- aten/src/ATen/native/vulkan/api/Runtime.cpp | 343 ++++++++++++++++++ aten/src/ATen/native/vulkan/api/Runtime.h | 64 ++++ aten/src/ATen/native/vulkan/api/Shader.cpp | 15 +- aten/src/ATen/native/vulkan/api/Shader.h | 14 +- aten/src/ATen/native/vulkan/api/api.h | 2 + aten/src/ATen/test/vulkan_api_test.cpp | 5 - 18 files changed, 582 insertions(+), 366 deletions(-) create mode 100644 aten/src/ATen/native/vulkan/api/Adapter.h create mode 100644 aten/src/ATen/native/vulkan/api/Runtime.cpp create mode 100644 aten/src/ATen/native/vulkan/api/Runtime.h diff --git a/aten/src/ATen/native/vulkan/api/Adapter.h b/aten/src/ATen/native/vulkan/api/Adapter.h new file mode 100644 index 000000000000..239edfb74518 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Adapter.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +// +// A Vulkan Adapter represents a physical device and its properties. Adapters +// are enumerated through the Runtime and are used in creation of Contexts. +// Each tensor in PyTorch is associated with a Context to make the +// device <-> tensor affinity explicit. 
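The comment above positions the Adapter as the hand-off point between physical
device enumeration and Context creation. A minimal, illustrative sketch of how a
caller might drive that, using the Runtime::select API added elsewhere in this
same change; the discrete-GPU preference is an invented example policy, and
select() throws if no enumerated adapter satisfies the predicate:

  using namespace at::native::vulkan::api;

  // The selector is invoked once per enumerated physical device and returns
  // true for the adapter the caller wants to keep.
  const Adapter adapter = runtime()->select([](const Adapter& adapter) {
    // Example policy only: prefer a discrete GPU.
    return VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU == adapter.properties.deviceType;
  });

  // adapter.handle, adapter.compute_queue_family_index and
  // adapter.has_unified_memory() then drive logical device creation,
  // typically by constructing a Context from the adapter.

The default path in this change keeps the policy trivial and simply returns true
from the selector, taking the first adapter the instance reports.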
+// + +struct Adapter final { + Runtime* runtime; + VkPhysicalDevice handle; + VkPhysicalDeviceProperties properties; + VkPhysicalDeviceMemoryProperties memory_properties; + uint32_t compute_queue_family_index; + + inline bool has_unified_memory() const { + // Ideally iterate over all memory types to see if there is a pool that + // is both host-visible, and device-local. This should be a good proxy + // for now. + return VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU == properties.deviceType; + } +}; + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Command.cpp b/aten/src/ATen/native/vulkan/api/Command.cpp index 48512215c5fc..a7793aea16dc 100644 --- a/aten/src/ATen/native/vulkan/api/Command.cpp +++ b/aten/src/ATen/native/vulkan/api/Command.cpp @@ -5,15 +5,15 @@ namespace native { namespace vulkan { namespace api { -Command::Pool::Factory::Factory(const VkDevice device) - : device_(device) { +Command::Pool::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_, "Invalid Vulkan device!"); } typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( - const Descriptor& descriptor) const { + const Descriptor& descriptor) const { const VkCommandPoolCreateInfo command_pool_create_info{ VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, nullptr, diff --git a/aten/src/ATen/native/vulkan/api/Command.h b/aten/src/ATen/native/vulkan/api/Command.h index 554e6fdf373e..b0c171faa490 100644 --- a/aten/src/ATen/native/vulkan/api/Command.h +++ b/aten/src/ATen/native/vulkan/api/Command.h @@ -29,7 +29,7 @@ struct Command final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Pool::Descriptor Descriptor; typedef VK_DELETER(CommandPool) Deleter; @@ -52,8 +52,8 @@ struct Command final { typedef api::Cache Cache; Cache cache; - explicit Pool(const VkDevice device) - : cache(Factory(device)) { + explicit Pool(const GPU& gpu) + : cache(Factory(gpu)) { } static void purge(VkDevice device, VkCommandPool command_pool); @@ -78,8 +78,8 @@ struct Command final { VkCommandBuffer command_buffer_; }; - explicit Command(const VkDevice device) - : pool(device) { + explicit Command(const GPU& gpu) + : pool(gpu) { } }; diff --git a/aten/src/ATen/native/vulkan/api/Common.h b/aten/src/ATen/native/vulkan/api/Common.h index aec26bf987a0..cbd53e8045ef 100644 --- a/aten/src/ATen/native/vulkan/api/Common.h +++ b/aten/src/ATen/native/vulkan/api/Common.h @@ -40,6 +40,7 @@ namespace native { namespace vulkan { namespace api { +struct Adapter; struct Command; class Context; struct Descriptor; @@ -48,6 +49,12 @@ struct Resource; class Runtime; struct Shader; +struct GPU final { + const Adapter* adapter; + VkDevice device; + VkQueue queue; +}; + VK_DELETER_DISPATCHABLE_DECLARE(Instance); VK_DELETER_DISPATCHABLE_DECLARE(Device); VK_DELETER_NON_DISPATCHABLE_DECLARE(Semaphore); diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp index 206967b550b2..d0fa08dbde1d 100644 --- a/aten/src/ATen/native/vulkan/api/Context.cpp +++ b/aten/src/ATen/native/vulkan/api/Context.cpp @@ -8,221 +8,31 @@ namespace vulkan { namespace api { namespace { -struct Configuration final { -#ifndef DEBUG - static constexpr bool kEnableValidationLayers = false; -#else - static constexpr bool kEnableValidationLayers = true; -#endif -}; - -VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( - const VkDebugReportFlagsEXT flags, - 
const VkDebugReportObjectTypeEXT /* object_type */, - const uint64_t /* object */, - const size_t /* location */, - const int32_t message_code, - const char* const layer_prefix, - const char* const message, - void* const /* user_data */) { - std::stringstream stream; - stream << layer_prefix << " " << message_code << " " << message << std::endl; - const std::string log = stream.str(); - - if (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT) { - LOG(ERROR) << log; - } else if (flags & VK_DEBUG_REPORT_WARNING_BIT_EXT) { - LOG(WARNING) << log; - } else if (flags & VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT) { - LOG(WARNING) << "Performance:" << log; - } else if (flags & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { - LOG(INFO) << log; - } else if (flags & VK_DEBUG_REPORT_DEBUG_BIT_EXT) { - LOG(INFO) << "Debug: " << log; - } - - return VK_FALSE; -} - -VkInstance create_instance(const bool enable_validation_layers) { - std::vector enabled_instance_layers; - std::vector enabled_instance_extensions; - - if (enable_validation_layers) { - uint32_t instance_layers_count = 0; - VK_CHECK(vkEnumerateInstanceLayerProperties( - &instance_layers_count, nullptr)); - - std::vector instance_layer_properties( - instance_layers_count); - - VK_CHECK(vkEnumerateInstanceLayerProperties( - &instance_layers_count, - instance_layer_properties.data())); - - constexpr const char* const requested_instance_layers[]{ - // "VK_LAYER_LUNARG_api_dump", - "VK_LAYER_KHRONOS_validation", - }; - - for (const auto& requested_instance_layer : requested_instance_layers) { - for (const auto& layer : instance_layer_properties) { - if (strcmp(requested_instance_layer, layer.layerName) == 0) { - enabled_instance_layers.push_back(requested_instance_layer); - break; - } - } - } - - uint32_t instance_extension_count = 0; - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &instance_extension_count, nullptr)); - - std::vector instance_extension_properties( - instance_extension_count); - - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, - &instance_extension_count, - instance_extension_properties.data())); - - constexpr const char* const requested_instance_extensions[]{ - VK_EXT_DEBUG_REPORT_EXTENSION_NAME, - }; +Context* initialize() { + static const std::unique_ptr context([]() -> Context* { + try { + const Adapter adapter = runtime()->select([](const Adapter& adapter) { + // Select the first adapter. 
+ return true; + }); - for (const auto& requested_instance_extension : requested_instance_extensions) { - for (const auto& extension : instance_extension_properties) { - if (strcmp(requested_instance_extension, extension.extensionName) == 0) { - enabled_instance_extensions.push_back(requested_instance_extension); - break; - } - } + return new Context(adapter); } - } - - constexpr VkApplicationInfo application_info{ - VK_STRUCTURE_TYPE_APPLICATION_INFO, - nullptr, - "PyTorch", - 0, - "PyTorch", - 0, - VK_API_VERSION_1_0, - }; - - const VkInstanceCreateInfo instance_create_info{ - VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, - nullptr, - 0u, - &application_info, - static_cast(enabled_instance_layers.size()), - enabled_instance_layers.data(), - static_cast(enabled_instance_extensions.size()), - enabled_instance_extensions.data(), - }; - - VkInstance instance{}; - VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); - TORCH_CHECK(instance, "Invalid Vulkan instance!"); - - return instance; -} - -VkDebugReportCallbackEXT create_debug_report_callback( - const VkInstance instance, - const bool enable_validation_layers) { - if (!enable_validation_layers) { - return VkDebugReportCallbackEXT{}; - } - - const VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{ - VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, - nullptr, - VK_DEBUG_REPORT_INFORMATION_BIT_EXT | - VK_DEBUG_REPORT_WARNING_BIT_EXT | - VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | - VK_DEBUG_REPORT_ERROR_BIT_EXT | - VK_DEBUG_REPORT_DEBUG_BIT_EXT, - debug_report_callback_fn, - nullptr, - }; - - const auto vkCreateDebugReportCallbackEXT = - (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance, "vkCreateDebugReportCallbackEXT"); - - TORCH_CHECK( - vkCreateDebugReportCallbackEXT, - "Could not load vkCreateDebugReportCallbackEXT"); - - VkDebugReportCallbackEXT debug_report_callback{}; - VK_CHECK(vkCreateDebugReportCallbackEXT( - instance, - &debugReportCallbackCreateInfo, - nullptr, - &debug_report_callback)); - - TORCH_CHECK( - debug_report_callback, - "Invalid Vulkan debug report callback!"); - - return debug_report_callback; -} - -VkPhysicalDevice acquire_physical_device(const VkInstance instance) { - uint32_t device_count = 0; - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); - - TORCH_CHECK( - device_count > 0, - "Vulkan: Could not find a device with Vulkan support!"); - - std::vector devices(device_count); - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); - - return devices[0]; -} - -VkPhysicalDeviceLimits query_physical_device_physical_device_limits( - const VkPhysicalDevice physical_device) { - VkPhysicalDeviceProperties physical_device_properties{}; - vkGetPhysicalDeviceProperties(physical_device, &physical_device_properties); - return physical_device_properties.limits; -} - -uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device) { - uint32_t queue_family_count = 0; - - vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, &queue_family_count, nullptr); - - TORCH_CHECK( - queue_family_count > 0, - "Vulkan: Invalid number of queue families!"); - - std::vector queue_families_properties( - queue_family_count); - - vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, - &queue_family_count, - queue_families_properties.data()); - - for (uint32_t i = 0; i < queue_families_properties.size(); ++i) { - const VkQueueFamilyProperties& properties = queue_families_properties[i]; - if 
(properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) { - return i; + catch (...) { + return nullptr; } - } + }()); - TORCH_CHECK( - false, - "Vulkan: Could not find a queue family that supports compute operations!"); + return context.get(); } VkDevice create_device( const VkPhysicalDevice physical_device, const uint32_t compute_queue_family_index) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + const float queue_priorities = 1.0f; const VkDeviceQueueCreateInfo device_queue_create_info{ VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, @@ -255,6 +65,10 @@ VkDevice create_device( VkQueue acquire_queue( const VkDevice device, const uint32_t compute_queue_family_index) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + VkQueue queue{}; vkGetDeviceQueue(device, compute_queue_family_index, 0, &queue); TORCH_CHECK(queue, "Invalid Vulkan queue!"); @@ -264,65 +78,25 @@ VkQueue acquire_queue( } // namespace -Context::Context(const bool enable_validation_layers) - : instance_(create_instance(enable_validation_layers), &VK_DELETER(Instance)), - debug_report_callback_( - create_debug_report_callback(instance(), enable_validation_layers), - Debug(instance())), - physical_device_(acquire_physical_device(instance())), - physical_device_limits_(query_physical_device_physical_device_limits(physical_device())), - compute_queue_family_index_(query_compute_queue_family_index(physical_device())), - device_(create_device(physical_device(), compute_queue_family_index_), &VK_DELETER(Device)), - queue_(acquire_queue(device(), compute_queue_family_index_)), - command_(device()), - shader_(device()), - pipeline_(device()), - descriptor_(device()), - resource_(instance(), physical_device(), device()) { -} - -Context::Debug::Debug(const VkInstance instance) - : instance_(instance) { -} - -void Context::Debug::operator()( - const VkDebugReportCallbackEXT debug_report_callback) const { - if (debug_report_callback) { - const auto vkDestroyDebugReportCallbackEXT = - (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance_, "vkDestroyDebugReportCallbackEXT"); - - TORCH_CHECK( - vkDestroyDebugReportCallbackEXT, - "Could not load vkDestroyDebugReportCallbackEXT"); - - vkDestroyDebugReportCallbackEXT( - instance_, debug_report_callback, nullptr); - } -} - -Context* initialize() { - static const std::unique_ptr context([]() -> Context* { -#ifdef USE_VULKAN_WRAPPER - if (!InitVulkan()) { - TORCH_WARN("Vulkan: Wrapper Failed to InitVulkan"); - return nullptr; - } -#endif - - try { - return new Context(Configuration::kEnableValidationLayers); - } - catch (...) { - return nullptr; - } - }()); - - return context.get(); +void Context::Deleter::operator()(const VkDevice device) const { + // No VK_CHECK. Don't want an exception thrown in the destructor. 
+ vkDeviceWaitIdle(device); + vkDestroyDevice(device, nullptr); } -bool available() { - return initialize(); +Context::Context(const Adapter& adapter) + : adapter_(adapter), + device_( + create_device( + adapter.handle, + adapter.compute_queue_family_index), + Deleter{}), + queue_(acquire_queue(device(), adapter.compute_queue_family_index)), + command_(gpu()), + shader_(gpu()), + pipeline_(gpu()), + descriptor_(gpu()), + resource_(gpu()) { } Context* context() { diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h index 7cec6ada5d5e..5d593bdd9bc1 100644 --- a/aten/src/ATen/native/vulkan/api/Context.h +++ b/aten/src/ATen/native/vulkan/api/Context.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -14,34 +15,29 @@ namespace api { // // Vulkan Context holds onto all relevant Vulkan state as it pertains to our -// use of Vulkan in PyTorch. The context is currently a global object, but -// technically it does not need to be if we were to make it explicit to the -// user. +// use of Vulkan in PyTorch. A Context is associated with one, and only one, +// Adapter as a precursor to multi-GPU support. All Vulkan tensors in PyTorch +// are associated with a Context to make tensor <-> device affinity explicit. +// The context is currently a global object, but technically it does not need +// to be if we were to make it explicit to the user. // class Context final { public: - explicit Context(bool enable_validation_layers); + explicit Context(const Adapter& adapter); + Context(const Context&) = delete; + Context(Context&&) = default; + Context& operator=(const Context&) = delete; + Context& operator=(Context&&) = default; ~Context() = default; - inline VkInstance instance() const { - return instance_.get(); - } - - inline VkPhysicalDevice physical_device() const { - return physical_device_; - } - - inline const VkPhysicalDeviceLimits& physical_device_limits() const { - return physical_device_limits_; - } - - inline VkDevice device() const { - return device_.get(); - } - - inline VkQueue queue() const { - return queue_; + inline GPU gpu() { + // A GPU is simply a (physical device, logical device, device queue) trio. + return { + &adapter_, + device(), + queue(), + }; } inline Command& command() { @@ -65,23 +61,26 @@ class Context final { } private: - class Debug final { - public: - explicit Debug(VkInstance instance); - void operator()(VkDebugReportCallbackEXT debug_report_callback) const; + inline VkDevice device() { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_); + return device_.get(); + } - private: - VkInstance instance_; + inline VkQueue queue() { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(queue_); + return queue_; + } + + private: + class Deleter final { + public: + void operator()(VkDevice device) const; }; private: // Construction and destruction order matters. Do not move members around. 
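The warning above is not just a style note: C++ constructs non-static data
members in declaration order and destroys them in reverse; queue_ can only be
acquired once device_ exists, and the command_, shader_, pipeline_, descriptor_
and resource_ members are all built from the gpu() trio that reads adapter_,
device_ and queue_. Reordering the declarations would silently change both the
construction and the tear-down sequence.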
- Handle instance_; - Handle debug_report_callback_; - VkPhysicalDevice physical_device_; - VkPhysicalDeviceLimits physical_device_limits_; - uint32_t compute_queue_family_index_; - Handle device_; + Adapter adapter_; + Handle device_; VkQueue queue_; Command command_; Shader shader_; @@ -90,7 +89,6 @@ class Context final { Resource resource_; }; -bool available(); Context* context(); } // namespace api diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.cpp b/aten/src/ATen/native/vulkan/api/Descriptor.cpp index bab10466ea02..ff0505ccebca 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.cpp +++ b/aten/src/ATen/native/vulkan/api/Descriptor.cpp @@ -44,15 +44,15 @@ const Descriptor::Pool::Descriptor Descriptor::Pool::kDefault{ }, }; -Descriptor::Pool::Factory::Factory(const VkDevice device) - : device_(device) { +Descriptor::Pool::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device, "Invalid Vulkan device!"); } typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator()( - const Descriptor& descriptor) const { + const Descriptor& descriptor) const { const VkDescriptorPoolCreateInfo descriptor_pool_create_info{ VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, nullptr, diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.h b/aten/src/ATen/native/vulkan/api/Descriptor.h index da4a2a03e2f9..bc6c14723990 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.h +++ b/aten/src/ATen/native/vulkan/api/Descriptor.h @@ -72,7 +72,7 @@ struct Descriptor final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Pool::Descriptor Descriptor; typedef VK_DELETER(DescriptorPool) Deleter; @@ -95,8 +95,8 @@ struct Descriptor final { typedef api::Cache Cache; Cache cache; - explicit Pool(const VkDevice device) - : cache(Factory(device)) { + explicit Pool(const GPU& gpu) + : cache(Factory(gpu)) { } static void purge(VkDevice device, VkDescriptorPool descriptor_pool); @@ -118,9 +118,9 @@ struct Descriptor final { VkDescriptorPool descriptor_pool_; } factory; - explicit Descriptor(const VkDevice device) - : pool(device), - factory(device, pool.cache.retrieve(Pool::kDefault)) { + explicit Descriptor(const GPU& gpu) + : pool(gpu), + factory(gpu.device, pool.cache.retrieve(Pool::kDefault)) { } }; diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.cpp b/aten/src/ATen/native/vulkan/api/Pipeline.cpp index 3c845c5fae32..bd9881c05443 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.cpp +++ b/aten/src/ATen/native/vulkan/api/Pipeline.cpp @@ -5,8 +5,8 @@ namespace native { namespace vulkan { namespace api { -Pipeline::Layout::Factory::Factory(const VkDevice device) - : device_(device) { +Pipeline::Layout::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_, "Invalid Vulkan device!"); @@ -76,11 +76,11 @@ VkPipelineCache create_pipeline_cache(const VkDevice device) { } // namespace -Pipeline::Factory::Factory(const VkDevice device) - : device_(device), +Pipeline::Factory::Factory(const GPU& gpu) + : device_(gpu.device), pipeline_cache_( - create_pipeline_cache(device), - VK_DELETER(PipelineCache)(device)) { + create_pipeline_cache(device_), + VK_DELETER(PipelineCache)(device_)) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_, "Invalid Vulkan device!"); diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.h b/aten/src/ATen/native/vulkan/api/Pipeline.h index 0ecef40c8b19..c327a140eded 100644 --- 
a/aten/src/ATen/native/vulkan/api/Pipeline.h +++ b/aten/src/ATen/native/vulkan/api/Pipeline.h @@ -49,7 +49,7 @@ struct Pipeline final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Layout::Descriptor Descriptor; typedef VK_DELETER(PipelineLayout) Deleter; @@ -72,8 +72,8 @@ struct Pipeline final { typedef api::Cache Cache; Cache cache; - explicit Layout(const VkDevice device) - : cache(Factory(device)) { + explicit Layout(const GPU& gpu) + : cache(Factory(gpu)) { } } layout; @@ -93,7 +93,7 @@ struct Pipeline final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Pipeline::Descriptor Descriptor; typedef VK_DELETER(Pipeline) Deleter; @@ -117,9 +117,9 @@ struct Pipeline final { typedef api::Cache Cache; Cache cache; - explicit Pipeline(const VkDevice device) - : layout(device), - cache(Factory(device)) { + explicit Pipeline(const GPU& gpu) + : layout(gpu), + cache(Factory(gpu)) { } }; diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp index 7163294bd1d9..6969883cb183 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.cpp +++ b/aten/src/ATen/native/vulkan/api/Resource.cpp @@ -1,4 +1,5 @@ #include +#include namespace at { namespace native { @@ -59,6 +60,7 @@ VmaAllocationCreateInfo create_allocation_create_info( } void release_buffer(const Resource::Buffer& buffer) { + // Safe to pass null as buffer or allocation. vmaDestroyBuffer( buffer.memory.allocator, buffer.handle, @@ -72,6 +74,7 @@ void release_image(const Resource::Image& image) { vkDestroyImageView(allocator_info.device, image.view, nullptr); } + // Safe to pass null as image or allocation. vmaDestroyImage( image.memory.allocator, image.handle, @@ -124,22 +127,20 @@ void Resource::Memory::Scope::operator()(const void* const data) const { } } -Resource::Pool::Pool( - const VkInstance instance, - const VkPhysicalDevice physical_device, - const VkDevice device) - : device_(device), +Resource::Pool::Pool(const GPU& gpu) + : device_(gpu.device), allocator_( create_allocator( - instance, - physical_device, - device), + gpu.adapter->runtime->instance(), + gpu.adapter->handle, + device_), vmaDestroyAllocator) { buffers_.reserve(Configuration::kReserve); images_.reserve(Configuration::kReserve); } -Resource::Buffer Resource::Pool::allocate(const Buffer::Descriptor& descriptor) { +Resource::Buffer Resource::Pool::allocate( + const Buffer::Descriptor& descriptor) { const VkBufferCreateInfo buffer_create_info{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, @@ -183,7 +184,8 @@ Resource::Buffer Resource::Pool::allocate(const Buffer::Descriptor& descriptor) return buffers_.back().get(); } -Resource::Image Resource::Pool::allocate(const Image::Descriptor& descriptor) { +Resource::Image Resource::Pool::allocate( + const Image::Descriptor& descriptor) { const VkImageCreateInfo image_create_info{ VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, nullptr, diff --git a/aten/src/ATen/native/vulkan/api/Resource.h b/aten/src/ATen/native/vulkan/api/Resource.h index a74a3c2d3c89..00145ebe071f 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.h +++ b/aten/src/ATen/native/vulkan/api/Resource.h @@ -108,10 +108,7 @@ struct Resource final { class Pool final { public: - Pool( - VkInstance instance, - VkPhysicalDevice physical_device, - VkDevice device); + explicit Pool(const GPU& gpu); Buffer allocate(const Buffer::Descriptor& descriptor); Image allocate(const Image::Descriptor& descriptor); 
@@ -128,11 +125,8 @@ struct Resource final { std::vector> images_; } pool; - Resource( - const VkInstance instance, - const VkPhysicalDevice physical_device, - const VkDevice device) - : pool(instance, physical_device, device) { + explicit Resource(const GPU& gpu) + : pool(gpu) { } }; diff --git a/aten/src/ATen/native/vulkan/api/Runtime.cpp b/aten/src/ATen/native/vulkan/api/Runtime.cpp new file mode 100644 index 000000000000..ce6e3b4231e4 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp @@ -0,0 +1,343 @@ +#include +#include + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { +namespace { + +struct Configuration final { +#ifndef DEBUG + static constexpr Runtime::Type kRuntime = Runtime::Type::Debug; +#else + static constexpr Runtime::Type kRuntime = Runtime::Type::Release; +#endif +}; + +VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( + const VkDebugReportFlagsEXT flags, + const VkDebugReportObjectTypeEXT /* object_type */, + const uint64_t /* object */, + const size_t /* location */, + const int32_t message_code, + const char* const layer_prefix, + const char* const message, + void* const /* user_data */) { + std::stringstream stream; + stream << layer_prefix << " " << message_code << " " << message << std::endl; + const std::string log = stream.str(); + + if (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT) { + LOG(ERROR) << log; + } else if (flags & VK_DEBUG_REPORT_WARNING_BIT_EXT) { + LOG(WARNING) << log; + } else if (flags & VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT) { + LOG(WARNING) << "Performance:" << log; + } else if (flags & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { + LOG(INFO) << log; + } else if (flags & VK_DEBUG_REPORT_DEBUG_BIT_EXT) { + LOG(INFO) << "Debug: " << log; + } + + return VK_FALSE; +} + +VkInstance create_instance(const Runtime::Type type) { + std::vector enabled_instance_layers; + std::vector enabled_instance_extensions; + + if (Runtime::Type::Debug == type) { + uint32_t instance_layers_count = 0; + VK_CHECK(vkEnumerateInstanceLayerProperties( + &instance_layers_count, nullptr)); + + std::vector instance_layer_properties( + instance_layers_count); + + VK_CHECK(vkEnumerateInstanceLayerProperties( + &instance_layers_count, + instance_layer_properties.data())); + + constexpr const char* const requested_instance_layers[]{ + // "VK_LAYER_LUNARG_api_dump", + "VK_LAYER_KHRONOS_validation", + }; + + for (const auto& requested_instance_layer : requested_instance_layers) { + for (const auto& layer : instance_layer_properties) { + if (strcmp(requested_instance_layer, layer.layerName) == 0) { + enabled_instance_layers.push_back(requested_instance_layer); + break; + } + } + } + + uint32_t instance_extension_count = 0; + VK_CHECK(vkEnumerateInstanceExtensionProperties( + nullptr, &instance_extension_count, nullptr)); + + std::vector instance_extension_properties( + instance_extension_count); + + VK_CHECK(vkEnumerateInstanceExtensionProperties( + nullptr, &instance_extension_count, instance_extension_properties.data())); + + constexpr const char* const requested_instance_extensions[]{ + VK_EXT_DEBUG_REPORT_EXTENSION_NAME, + }; + + for (const auto& requested_instance_extension : requested_instance_extensions) { + for (const auto& extension : instance_extension_properties) { + if (strcmp(requested_instance_extension, extension.extensionName) == 0) { + enabled_instance_extensions.push_back(requested_instance_extension); + break; + } + } + } + } + + constexpr VkApplicationInfo application_info{ + VK_STRUCTURE_TYPE_APPLICATION_INFO, + 
nullptr, + "PyTorch", + 0, + "PyTorch", + 0, + VK_API_VERSION_1_0, + }; + + const VkInstanceCreateInfo instance_create_info{ + VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + nullptr, + 0u, + &application_info, + static_cast(enabled_instance_layers.size()), + enabled_instance_layers.data(), + static_cast(enabled_instance_extensions.size()), + enabled_instance_extensions.data(), + }; + + VkInstance instance{}; + VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); + TORCH_CHECK(instance, "Invalid Vulkan instance!"); + + return instance; +} + +VkDebugReportCallbackEXT create_debug_report_callback( + const VkInstance instance, + const Runtime::Type type) { + if (Runtime::Type::Debug != type) { + return VkDebugReportCallbackEXT{}; + } + + const VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{ + VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, + nullptr, + VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_WARNING_BIT_EXT | + VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT | + VK_DEBUG_REPORT_DEBUG_BIT_EXT, + debug_report_callback_fn, + nullptr, + }; + + const auto vkCreateDebugReportCallbackEXT = + (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( + instance, "vkCreateDebugReportCallbackEXT"); + + TORCH_CHECK( + vkCreateDebugReportCallbackEXT, + "Could not load vkCreateDebugReportCallbackEXT"); + + VkDebugReportCallbackEXT debug_report_callback{}; + VK_CHECK(vkCreateDebugReportCallbackEXT( + instance, + &debugReportCallbackCreateInfo, + nullptr, + &debug_report_callback)); + + TORCH_CHECK( + debug_report_callback, + "Invalid Vulkan debug report callback!"); + + return debug_report_callback; +} + +std::vector acquire_physical_devices( + const VkInstance instance) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + instance, + "Invalid Vulkan instance!"); + + uint32_t device_count = 0; + VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); + + TORCH_CHECK( + device_count > 0, + "Vulkan: Could not find a device with Vulkan support!"); + + std::vector devices(device_count); + VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); + + return devices; +} + +VkPhysicalDeviceProperties query_physical_device_properties( + const VkPhysicalDevice physical_device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + VkPhysicalDeviceProperties physical_device_properties{}; + vkGetPhysicalDeviceProperties( + physical_device, + &physical_device_properties); + + return physical_device_properties; +} + +VkPhysicalDeviceMemoryProperties query_physical_device_memory_properties( + const VkPhysicalDevice physical_device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + VkPhysicalDeviceMemoryProperties physical_device_memory_properties{}; + vkGetPhysicalDeviceMemoryProperties( + physical_device, + &physical_device_memory_properties); + + return physical_device_memory_properties; +} + +uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + uint32_t queue_family_count = 0; + vkGetPhysicalDeviceQueueFamilyProperties( + physical_device, &queue_family_count, nullptr); + + TORCH_CHECK( + queue_family_count > 0, + "Vulkan: Invalid number of queue families!"); + + std::vector + queue_families_properties(queue_family_count); + + vkGetPhysicalDeviceQueueFamilyProperties( + physical_device, + 
&queue_family_count, + queue_families_properties.data()); + + for (uint32_t i = 0; i < queue_families_properties.size(); ++i) { + const VkQueueFamilyProperties& properties = queue_families_properties[i]; + if (properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) { + return i; + } + } + + TORCH_CHECK( + false, + "Vulkan: Could not find a queue family that supports compute operations!"); +} + +} // namespace + +Runtime::Debug::Debug(const VkInstance instance) + : instance_(instance) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + instance, + "Invalid Vulkan instance!"); +} + +void Runtime::Debug::operator()( + const VkDebugReportCallbackEXT debug_report_callback) const { + if (debug_report_callback) { + const auto vkDestroyDebugReportCallbackEXT = + (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( + instance_, "vkDestroyDebugReportCallbackEXT"); + + TORCH_CHECK( + vkDestroyDebugReportCallbackEXT, + "Could not load vkDestroyDebugReportCallbackEXT"); + + vkDestroyDebugReportCallbackEXT( + instance_, debug_report_callback, nullptr); + } +} + +Runtime::Runtime(const Type type) + : instance_(create_instance(type), &VK_DELETER(Instance)), + debug_report_callback_( + create_debug_report_callback(instance(), type), + Debug(instance())) { +} + +Adapter Runtime::select(const Selector& selector) { + const std::vector physical_devices = + acquire_physical_devices(instance()); + + for (const VkPhysicalDevice physical_device : physical_devices) { + const Adapter adapter{ + this, + physical_device, + query_physical_device_properties(physical_device), + query_physical_device_memory_properties(physical_device), + query_compute_queue_family_index(physical_device), + }; + + if (selector(adapter)) { + return adapter; + } + } + + TORCH_CHECK( + false, + "Vulkan: no adapter was selected as part of device enumeration!"); +} + +Runtime* initialize() { + static const std::unique_ptr runtime([]() -> Runtime* { +#ifdef USE_VULKAN_WRAPPER + if (!InitVulkan()) { + TORCH_WARN("Vulkan: Wrapper Failed to InitVulkan!"); + return nullptr; + } +#endif + + try { + return new Runtime(Configuration::kRuntime); + } + catch (...) { + return nullptr; + } + }()); + + return runtime.get(); +} + +bool available() { + return initialize(); +} + +Runtime* runtime() { + Runtime* const runtime = initialize(); + TORCH_CHECK( + runtime, + "Vulkan: Backend not available on this platform!" + "Calls to api::runtime() must have been guarded by api::available()."); + + return runtime; +} + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Runtime.h b/aten/src/ATen/native/vulkan/api/Runtime.h new file mode 100644 index 000000000000..766aeb50cabc --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Runtime.h @@ -0,0 +1,64 @@ +#pragma once + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +// +// A Vulkan Runtime initializes a Vulkan instance and decouples the concept of +// Vulkan instance initialization from intialization of, and subsequent +// interactions with, Vulkan [physical and logical] devices as a precursor to +// multi-GPU support. The Vulkan Runtime can be queried for available Adapters +// (i.e. physical devices) in the system which in turn can be used for creation +// of a Vulkan Context (i.e. logical devices). All Vulkan tensors in PyTorch +// are associated with a Context to make tensor <-> device affinity explicit. 
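Since the Runtime now owns the VkInstance and hands out Adapters on request,
here is a hedged sketch of the intended calling pattern, including the
available() guard that the runtime() accessor insists on. The two selectors are
placeholder policies; select() throws if no enumerated adapter matches its
predicate, so a real caller would use a more forgiving policy:

  using namespace at::native::vulkan::api;

  if (available()) {
    Runtime* const vulkan_runtime = runtime();

    // Each select() call re-enumerates the physical devices behind the
    // instance and returns the first one the predicate accepts.
    const Adapter integrated = vulkan_runtime->select([](const Adapter& adapter) {
      return adapter.has_unified_memory();
    });

    const Adapter discrete = vulkan_runtime->select([](const Adapter& adapter) {
      return !adapter.has_unified_memory();
    });

    // Two independent logical devices (and queues) over the same instance,
    // which is the multi-GPU arrangement this change is preparing for.
    Context first(integrated);
    Context second(discrete);
  }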
+// + +class Runtime final { + public: + enum class Type { + Debug, + Release, + }; + + explicit Runtime(Type type); + Runtime(const Runtime&) = delete; + Runtime(Runtime&&) = default; + Runtime& operator=(const Runtime&) = delete; + Runtime& operator=(Runtime&&) = default; + ~Runtime() = default; + + inline VkInstance instance() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(instance_); + return instance_.get(); + } + + typedef std::function Selector; + Adapter select(const Selector& selector); + + private: + class Debug final { + public: + explicit Debug(VkInstance instance); + void operator()(VkDebugReportCallbackEXT debug_report_callback) const; + + private: + VkInstance instance_; + }; + + private: + // Construction and destruction order matters. Do not move members around. + Handle instance_; + Handle debug_report_callback_; +}; + +bool available(); +Runtime* runtime(); + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Shader.cpp b/aten/src/ATen/native/vulkan/api/Shader.cpp index 4cde24a2eef9..977f915a61d1 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.cpp +++ b/aten/src/ATen/native/vulkan/api/Shader.cpp @@ -9,11 +9,12 @@ namespace native { namespace vulkan { namespace api { -Shader::Layout::Factory::Factory(const VkDevice device) - : device_(device) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device_, - "Invalid Vulkan device!"); + +Shader::Layout::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( @@ -113,8 +114,8 @@ struct Shader::Factory::Compiler final { #endif /* USE_VULKAN_SHADERC_RUNTIME */ -Shader::Factory::Factory(const VkDevice device) - : device_(device), +Shader::Factory::Factory(const GPU& gpu) + : device_(gpu.device), compiler_(new Compiler) { } diff --git a/aten/src/ATen/native/vulkan/api/Shader.h b/aten/src/ATen/native/vulkan/api/Shader.h index 4a0080cb888d..ff02b2ba9064 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.h +++ b/aten/src/ATen/native/vulkan/api/Shader.h @@ -52,7 +52,7 @@ struct Shader final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Layout::Descriptor Descriptor; typedef VK_DELETER(DescriptorSetLayout) Deleter; @@ -75,8 +75,8 @@ struct Shader final { typedef api::Cache Cache; Cache cache; - explicit Layout(const VkDevice device) - : cache(Factory(device)) { + explicit Layout(const GPU& gpu) + : cache(Factory(gpu)) { } } layout; @@ -122,7 +122,7 @@ struct Shader final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); Factory(const Factory&) = delete; Factory& operator=(const Factory&) = delete; Factory(Factory&&); @@ -152,9 +152,9 @@ struct Shader final { typedef api::Cache Cache; Cache cache; - explicit Shader(const VkDevice device) - : layout(device), - cache(Factory(device)) { + explicit Shader(const GPU& gpu) + : layout(gpu), + cache(Factory(gpu)) { } }; diff --git a/aten/src/ATen/native/vulkan/api/api.h b/aten/src/ATen/native/vulkan/api/api.h index 394f55d7d525..658824e3bf2b 100644 --- a/aten/src/ATen/native/vulkan/api/api.h +++ b/aten/src/ATen/native/vulkan/api/api.h @@ -2,9 +2,11 @@ #include +#include #include #include #include #include #include +#include #include diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 28c1827485b7..ebf9ffce99d0 
100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -6,11 +6,6 @@ namespace { -TEST(VulkanAPITest, Context) { - constexpr bool kDebug = true; - ASSERT_NO_THROW(at::native::vulkan::api::Context{kDebug}); -} - } // namespace #endif /* USE_VULKAN_API */ From 1539d4a66478cc2288ab63f971f9074e747018c2 Mon Sep 17 00:00:00 2001 From: Haixin Liu Date: Thu, 24 Sep 2020 15:18:04 -0700 Subject: [PATCH 109/449] Add operator to compute the equalization scale (#45096) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45096 Add operator to compute the equalization scale. This will be used in the integration of equalization into dper int8 fixed quant scheme quantization flow. Design docs: https://fb.quip.com/bb7SAGBxPGNC https://fb.quip.com/PDAOAsgoLfRr Test Plan: buck test caffe2/caffe2/quantization/server:compute_equalization_scale_test Reviewed By: jspark1105 Differential Revision: D23779870 fbshipit-source-id: 5e6a8c220935a142ecf8e61100a8c71932afa8d7 --- caffe2/opt/bound_shape_inferencer.cc | 4 +- .../server/compute_equalization_scale.cc | 96 +++++++++++++++++++ .../server/compute_equalization_scale.h | 18 ++++ .../server/compute_equalization_scale_test.py | 89 +++++++++++++++++ 4 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 caffe2/quantization/server/compute_equalization_scale.cc create mode 100644 caffe2/quantization/server/compute_equalization_scale.h create mode 100644 caffe2/quantization/server/compute_equalization_scale_test.py diff --git a/caffe2/opt/bound_shape_inferencer.cc b/caffe2/opt/bound_shape_inferencer.cc index d37717d5b957..d8fe956a0ddd 100644 --- a/caffe2/opt/bound_shape_inferencer.cc +++ b/caffe2/opt/bound_shape_inferencer.cc @@ -857,7 +857,8 @@ void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) { try { const static std::unordered_set types_with_independent_output_shape = {"Int8GenQuantParams", - "Int8QuantSchemeBlobFill"}; + "Int8QuantSchemeBlobFill", + "ComputeEqualizationScale"}; std::vector input_shapes; for (const auto& input : op.input()) { const auto it = shape_info_.find(input); @@ -883,6 +884,7 @@ void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) { bool is_quantized = !(op.type().compare(0, 4, "Int8")) && (op.type() != "Int8Dequantize") && (op.type() != "Int8QuantSchemeBlobFill") && + (op.type() != "ComputeEqualizationScale") && (op.type() != "Int8GenQuantParams"); float scale = 1; int offset = 0; diff --git a/caffe2/quantization/server/compute_equalization_scale.cc b/caffe2/quantization/server/compute_equalization_scale.cc new file mode 100644 index 000000000000..6e2f73ebd840 --- /dev/null +++ b/caffe2/quantization/server/compute_equalization_scale.cc @@ -0,0 +1,96 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
+#include "caffe2/quantization/server/compute_equalization_scale.h" +#include + +namespace caffe2 { +using namespace std; + +bool ComputeEqualizationScaleOp::RunOnDevice() { + // Generate equalization scale based on the input data (last N samples of + // the activations) and the weight + const auto& X = Input(0); + const auto& W = Input(1); + CAFFE_ENFORCE_EQ(X.dim(), 2); + CAFFE_ENFORCE_EQ(W.dim(), 2); + + const int64_t M = X.size_to_dim(1); + const int64_t N = W.size_to_dim(1); + const int64_t K = W.size_from_dim(1); + auto* S = Output(0, K, at::dtype()); + auto* S_INV = Output(1, K, at::dtype()); + const float* X_data = X.template data(); + const float* W_data = W.template data(); + float* S_data = S->template mutable_data(); + float* S_INV_data = S_INV->template mutable_data(); + + float WcolMax, XcolMax; + for (int64_t j = 0; j < K; j++) { + WcolMax = std::abs(W_data[j]); + XcolMax = std::abs(X_data[j]); + int64_t idx; + for (int64_t i = 0; i < N; i++) { + idx = i * K + j; + WcolMax = std::max(WcolMax, std::abs(W_data[idx])); + } + for (int64_t i = 0; i < M; i++) { + idx = i * K + j; + XcolMax = std::max(XcolMax, std::abs(X_data[idx])); + } + if (WcolMax == 0 || XcolMax == 0) { + S_data[j] = 1; + S_INV_data[j] = 1; + } else { + S_data[j] = std::sqrt(WcolMax / XcolMax); + S_INV_data[j] = 1 / S_data[j]; + } + } + return true; +} + +REGISTER_CPU_OPERATOR(ComputeEqualizationScale, ComputeEqualizationScaleOp); +OPERATOR_SCHEMA(ComputeEqualizationScale) + .NumInputs(2) + .NumOutputs(2) + .SetDoc(R"DOC( +Given a weight matrix W and input matrix X, the output S is the equalization parameter +vector computed from W and X, and S_INV = 1 / S + +S is computed by: +S[j] = max(abs(W[][j])) == 0 || max(abs(X[][j])) == 0 ? 1 : + sqrt(max(abs(W[][j])) / max(abs(X[][j]))), + +)DOC") + .TensorInferenceFunction([](const OperatorDef& /* def */, + const vector& in) { + vector out(2); + + if (in[0].unknown_shape() || in[1].unknown_shape()) { + out[0].set_unknown_shape(true); + out[1].set_unknown_shape(true); + return out; + } + const int64_t K = size_from_dim_(1, GetDimsVector(in[1])); + vector s_shape(2); + s_shape[0] = 1; + s_shape[1] = K; + out[0] = CreateTensorShape(s_shape, TensorProto_DataType_FLOAT); + out[1] = CreateTensorShape(s_shape, TensorProto_DataType_FLOAT); + return out; + }) + .Input( + 0, + "X", + "The input data, or last N samples of the output activations.") + .Input(1, "W", "The weight that we want to equalize with the input.") + .Output( + 0, + "S", + "Scale computed that will be multiplied to the columns of input.") + .Output( + 1, + "S_INV", + "Scale inverse that will be multiplied to the columns of weight.") + .SetDoc( + R"DOC(Operator to compute equalization scale given the input data and weight)DOC"); + +} // namespace caffe2 diff --git a/caffe2/quantization/server/compute_equalization_scale.h b/caffe2/quantization/server/compute_equalization_scale.h new file mode 100644 index 000000000000..a9facf8e1206 --- /dev/null +++ b/caffe2/quantization/server/compute_equalization_scale.h @@ -0,0 +1,18 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
+ +#pragma once +#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h" +#include "caffe2/quantization/server/dnnlowp.h" + +namespace caffe2 { + +class ComputeEqualizationScaleOp final : public Operator { + public: + ComputeEqualizationScaleOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override; + +}; // class ComputeEqualizationScaleOp + +} // namespace caffe2 diff --git a/caffe2/quantization/server/compute_equalization_scale_test.py b/caffe2/quantization/server/compute_equalization_scale_test.py new file mode 100644 index 000000000000..74d34c5502d3 --- /dev/null +++ b/caffe2/quantization/server/compute_equalization_scale_test.py @@ -0,0 +1,89 @@ +# Copyright (c) 2016-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +from __future__ import absolute_import, division, print_function, unicode_literals + +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np +from caffe2.python import core, workspace +from hypothesis import given, settings + + +class TestComputeEqualizationScaleOp(hu.HypothesisTestCase): + @settings(max_examples=10) + @given( + m=st.integers(1, 50), + n=st.integers(1, 50), + k=st.integers(1, 50), + rnd_seed=st.integers(1, 5), + **hu.gcs_cpu_only + ) + def test_compute_equalization_scale(self, m, n, k, rnd_seed, gc, dc): + np.random.seed(rnd_seed) + W = np.random.rand(n, k).astype(np.float32) - 0.5 + X = np.random.rand(m, k).astype(np.float32) - 0.5 + + def ref_compute_equalization_scale(X, W): + S = np.ones([X.shape[1]]) + S_INV = np.ones([X.shape[1]]) + for j in range(W.shape[1]): + WcolMax = np.absolute(W[:, j]).max() + XcolMax = np.absolute(X[:, j]).max() + if WcolMax and XcolMax: + S[j] = np.sqrt(WcolMax / XcolMax) + S_INV[j] = 1 / S[j] + return S, S_INV + + net = core.Net("test") + + ComputeEqualizationScaleOp = core.CreateOperator( + "ComputeEqualizationScale", ["X", "W"], ["S", "S_INV"] + ) + net.Proto().op.extend([ComputeEqualizationScaleOp]) + + self.ws.create_blob("X").feed(X, device_option=gc) + self.ws.create_blob("W").feed(W, device_option=gc) + self.ws.run(net) + + S = self.ws.blobs["S"].fetch() + S_INV = self.ws.blobs["S_INV"].fetch() + S_ref, S_INV_ref = ref_compute_equalization_scale(X, W) + np.testing.assert_allclose(S, S_ref, atol=1e-3, rtol=1e-3) + np.testing.assert_allclose(S_INV, S_INV_ref, atol=1e-3, rtol=1e-3) + + def test_compute_equalization_scale_shape_inference(self): + X = np.array([[1, 2], [2, 4], [6, 7]]).astype(np.float32) + W = np.array([[2, 3], [5, 4], [8, 2]]).astype(np.float32) + ComputeEqualizationScaleOp = core.CreateOperator( + "ComputeEqualizationScale", ["X", "W"], ["S", "S_INV"] + ) + workspace.FeedBlob("X", X) + workspace.FeedBlob("W", W) + + net = core.Net("test_shape_inference") + net.Proto().op.extend([ComputeEqualizationScaleOp]) + shapes, types = workspace.InferShapesAndTypes( + [net], + blob_dimensions={"X": 
X.shape, "W": W.shape}, + blob_types={"X": core.DataType.FLOAT, "W": core.DataType.FLOAT}, + ) + assert ( + "S" in shapes and "S" in types and "S_INV" in shapes and "S_INV" in types + ), "Failed to infer the shape or type of output" + self.assertEqual(shapes["S"], [1, 2]) + self.assertEqual(shapes["S_INV"], [1, 2]) + self.assertEqual(types["S"], core.DataType.FLOAT) + self.assertEqual(types["S_INV"], core.DataType.FLOAT) From 0137e3641d61983dbd22b2b2f9cc8c002e86aab4 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Thu, 24 Sep 2020 15:22:16 -0700 Subject: [PATCH 110/449] Refactor subgraph merging (#44238) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44238 Refactor create_autodiff_subgraphs to use the same updating of output aliasing properties logic as tensorexpr fuser, and factor that out to a common function in subgraph utils. Test Plan: Imported from OSS Reviewed By: Krovatkin, robieta Differential Revision: D23871565 Pulled By: eellison fbshipit-source-id: 72df253b16baf8e4aabf3d68b103b29e6a54d44c --- .../jit/passes/create_autodiff_subgraphs.cpp | 73 +------------- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 63 +----------- .../csrc/jit/passes/utils/subgraph_utils.cpp | 97 +++++++++++++++++++ torch/csrc/jit/passes/utils/subgraph_utils.h | 16 +++ 4 files changed, 121 insertions(+), 128 deletions(-) diff --git a/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp b/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp index 11bee519292c..6ac510b13777 100644 --- a/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp +++ b/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp @@ -13,66 +13,6 @@ namespace jit { namespace { -std::vector> gatherLastUses( - at::ArrayRef values) { - return fmap(values, [&](Value* v) -> c10::optional { - return firstOrLastUse(v, /*find_first*/ false); - }); -} - -// When merging a node into a subgraph, we wish to preserve all of the -// aliasing properties of the node's outputs. It is difficult to track -// the node or its contained nodes through all of the ir manipulation -// involved in merging; it is pretty easy to uniquely identify the value -// based on its uses. We can identify the value by its last use in the graph. -// Values which do not have uses or which do not have a last use -// outside of the subgraph to be merged into we do not need to track. 
-struct ValueMapper { - ValueMapper(Node* n, AliasDb& db, size_t subgraph_num_outputs) { - last_uses_ = gatherLastUses(n->outputs()); - subgraph_num_outputs_ = subgraph_num_outputs; - WithInsertPoint guard(n); - auto g = n->owningGraph(); - // temporary node to put the aliasing properties of the node before its - // merged and destroyed - placeholder_node_ = g->insertNode(g->create(prim::Uninitialized, 0)); - for (size_t i = 0; i < n->outputs().size(); ++i) { - Value* existing = n->outputs().at(i); - Value* new_value = - placeholder_node_->insertOutput(i)->copyMetadata(n->outputs().at(i)); - db.replaceWithNewValue(existing, new_value); - } - } - - bool usesEqual(const Use& a, const Use& b) { - return a.user == b.user && a.offset == b.offset; - } - - void copyAliasing(Node* merged_node, AliasDb& db) { - auto num_outputs = merged_node->outputs().size(); - auto new_outputs = merged_node->outputs().slice( - subgraph_num_outputs_, num_outputs - subgraph_num_outputs_); - for (Value* v : new_outputs) { - auto maybe_last_use = firstOrLastUse(v, /*find_first*/ false); - // if it doesnt have a use it shouldnt have been added as output - TORCH_INTERNAL_ASSERT(maybe_last_use); - const Use last_use = *maybe_last_use; - size_t i = 0; - while (i < last_uses_.size() && last_uses_.at(i).has_value() && - !usesEqual(*last_uses_.at(i), last_use)) { - ++i; - } - TORCH_INTERNAL_ASSERT(i != last_uses_.size()); - db.replaceWithNewValue(placeholder_node_->outputs().at(i), v); - } - placeholder_node_->destroy(); - } - - std::vector> last_uses_; - size_t subgraph_num_outputs_; - Node* placeholder_node_; -}; - struct WorkBlock : public std::pair { using pair::pair; @@ -285,11 +225,8 @@ class SubgraphSlicer { std::pair scanNode(Node* consumer) { if (shouldConsiderForMerge(consumer)) { if (consumer->kind() != prim::DifferentiableGraph) { - // ValueMapper preserves the aliasing information of the node's outputs - ValueMapper vm(consumer, aliasDb_, 0); - consumer = SubgraphUtils::createSingletonSubgraph( - consumer, prim::DifferentiableGraph); - vm.copyAliasing(consumer, aliasDb_); + consumer = SubgraphUtils::createSingletonSubgraphAndUpdateAliasing( + consumer, prim::DifferentiableGraph, aliasDb_); } auto inputs = sortReverseTopological(consumer->inputs()); for (auto input : inputs) { @@ -315,10 +252,8 @@ class SubgraphSlicer { return c10::nullopt; } - ValueMapper vm(producer, aliasDb_, consumer->outputs().size()); - SubgraphUtils::mergeNodeIntoSubgraph(producer, consumer); - vm.copyAliasing(consumer, aliasDb_); - + SubgraphUtils::mergeNodeIntoSubgraphAndUpdateAliasing( + producer, consumer, aliasDb_); return consumer; } diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 4d98110d3975..67a232d94088 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -305,69 +305,13 @@ class TensorExprFuser { } private: - // Merges `to_merge` into a subgraph by executing merge_fn. - // merge_fn takes in map that will be filled with the mapping b/w - // to_merge's outputs and the corresponding values in the subgraph. - // merge_fn returns the merged-into subgraph - Node* aliasingSafeSubgraphMerge( - Node* to_merge, - const std::function&)>& - merge_fn) { - // When we merge a node into a subgraph, the new subgraph outputs - // have the same aliasing properties as the original node's outputs. 
- // Here we create a placeholder node, transfer the aliasing properties - // to the placeholder, execute the merge, and transfer the aliasing - // properties to the appropriate fusion group outputs - Node* placeholder_node = - graph_->insertNode(graph_->create(prim::Uninitialized, 0)); - std::vector existing_values; - for (size_t i = 0; i < to_merge->outputs().size(); ++i) { - Value* existing = to_merge->outputs().at(i); - Value* new_value = placeholder_node->insertOutput(i)->copyMetadata( - to_merge->outputs().at(i)); - aliasDb_->replaceWithNewValue(existing, new_value); - existing_values.push_back(existing); - } - std::unordered_map vmap; - Node* fusion_group = merge_fn(vmap); - for (size_t i = 0; i < existing_values.size(); ++i) { - TORCH_INTERNAL_ASSERT(vmap.count(existing_values.at(i))); - Value* subgraph_value = vmap[existing_values.at(i)]; - auto subgraph = SubgraphUtils::getSubgraph(fusion_group); - size_t subgraph_output_index = 0; - for (; subgraph_output_index < subgraph->outputs().size(); - ++subgraph_output_index) { - if (subgraph->outputs().at(subgraph_output_index) == subgraph_value) { - break; - } - } - if (subgraph_output_index != subgraph->outputs().size()) { - aliasDb_->replaceWithNewValue( - placeholder_node->outputs().at(i), - fusion_group->outputs().at(subgraph_output_index)); - } - } - placeholder_node->destroy(); - return fusion_group; - } - Node* getOrCreateTensorExprSubgraph(Node* n) { if (n->hasAttribute(attr::Subgraph) && n->kind() == prim::TensorExprGroup) { return n; } GRAPH_UPDATE("Creating a tensorexpr::Group node from: ", *n); - return aliasingSafeSubgraphMerge( - n, [&](std::unordered_map& vmap) { - return SubgraphUtils::createSingletonSubgraph( - n, prim::TensorExprGroup, vmap); - }); - } - - void mergeNodeIntoSubgraphAndUpdateAliasing(Node* n, Node* subgraph) { - aliasingSafeSubgraphMerge(n, [&](std::unordered_map& vmap) { - SubgraphUtils::mergeNodeIntoSubgraph(n, subgraph, vmap); - return subgraph; - }); + return SubgraphUtils::createSingletonSubgraphAndUpdateAliasing( + n, prim::TensorExprGroup, *aliasDb_); } // Add unvisited input nodes to the queue for further merging into the fusion @@ -557,7 +501,8 @@ class TensorExprFuser { for (auto n : nodes_to_merge) { GRAPH_UPDATE("Merging ", getHeader(n)); - mergeNodeIntoSubgraphAndUpdateAliasing(n, fusion_group); + SubgraphUtils::mergeNodeIntoSubgraphAndUpdateAliasing( + n, fusion_group, *aliasDb_); } return fusion_group; } diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.cpp b/torch/csrc/jit/passes/utils/subgraph_utils.cpp index 6c9aad77cf93..1576aca36fa8 100644 --- a/torch/csrc/jit/passes/utils/subgraph_utils.cpp +++ b/torch/csrc/jit/passes/utils/subgraph_utils.cpp @@ -1,4 +1,5 @@ #include +#include namespace torch { namespace jit { @@ -9,6 +10,82 @@ bool hasSubgraph(Node* n) { return n->hasAttribute(attr::Subgraph); } +std::vector> gatherLastUses( + at::ArrayRef values) { + return fmap(values, [&](Value* v) -> c10::optional { + return firstOrLastUse(v, /*find_first*/ false); + }); +} + +// When merging a node into a subgraph, we wish to preserve all of the +// aliasing properties of the node's outputs. It is difficult to track +// the node or its contained nodes through all of the ir manipulation +// involved in merging; it is pretty easy to uniquely identify the value +// based on its uses. We can identify the value by its last use in the graph. +// Values which do not have uses or which do not have a last use +// outside of the subgraph to be merged into we do not need to track. 
+struct ValueMapper { + ValueMapper(Node* to_merge, AliasDb& db, size_t subgraph_num_outputs) { + last_uses_ = gatherLastUses(to_merge->outputs()); + subgraph_num_outputs_ = subgraph_num_outputs; + WithInsertPoint guard(to_merge); + auto g = to_merge->owningGraph(); + // temporary node to put the aliasing properties of the node before its + // merged and destroyed + placeholder_node_ = g->insertNode(g->create(prim::Uninitialized, 0)); + for (size_t i = 0; i < to_merge->outputs().size(); ++i) { + Value* existing = to_merge->outputs().at(i); + Value* new_value = placeholder_node_->insertOutput(i)->copyMetadata( + to_merge->outputs().at(i)); + db.replaceWithNewValue(existing, new_value); + } + } + + bool usesEqual(const Use& a, const Use& b) { + return a.user == b.user && a.offset == b.offset; + } + + void copyAliasing(Node* merged_node, AliasDb& db) { + auto num_outputs = merged_node->outputs().size(); + auto new_outputs = merged_node->outputs().slice( + subgraph_num_outputs_, num_outputs - subgraph_num_outputs_); + for (Value* v : new_outputs) { + auto maybe_last_use = firstOrLastUse(v, /*find_first*/ false); + // if it doesnt have a use it shouldnt have been added as output + TORCH_INTERNAL_ASSERT(maybe_last_use); + const Use last_use = *maybe_last_use; + size_t i = 0; + while (i < last_uses_.size() && last_uses_.at(i).has_value() && + !usesEqual(*last_uses_.at(i), last_use)) { + ++i; + } + TORCH_INTERNAL_ASSERT(i != last_uses_.size()); + db.replaceWithNewValue(placeholder_node_->outputs().at(i), v); + } + placeholder_node_->destroy(); + } + + std::vector> last_uses_; + size_t subgraph_num_outputs_; + Node* placeholder_node_; +}; + +Node* executeSubgraphMergeAndUpdateAliasing( + Node* to_merge, + c10::optional existing, + AliasDb& db, + const std::function& merge_fn) { + // When we merge a node into a subgraph, the new subgraph outputs + // have the same aliasing properties as the original node's outputs. + // Here we create a placeholder node, transfer the aliasing properties + // to the placeholder, execute the merge, and transfer the aliasing + // properties to the appropriate fusion group outputs + ValueMapper vm(to_merge, db, existing ? (*existing)->outputs().size() : 0); + Node* fusion_group = merge_fn(); + vm.copyAliasing(fusion_group, db); + return fusion_group; +} + // Combine the nodes in two subgraph together. The nodes will end up in // `mergeTo`, and `mergeFrom` is destroyed. 
 void mergeSubgraph(
@@ -281,6 +358,26 @@ Node* createSingletonSubgraph(Node* n, Symbol subgraphKind) {
   return createSingletonSubgraph(n, subgraphKind, vmap);
 }
 
+void mergeNodeIntoSubgraphAndUpdateAliasing(
+    Node* to_merge,
+    Node* subgraphNode,
+    AliasDb& db) {
+  executeSubgraphMergeAndUpdateAliasing(to_merge, subgraphNode, db, [&]() {
+    mergeNodeIntoSubgraph(to_merge, subgraphNode);
+    return subgraphNode;
+  });
+}
+
+Node* createSingletonSubgraphAndUpdateAliasing(
+    Node* to_merge,
+    Symbol subgraphKind,
+    AliasDb& db) {
+  return executeSubgraphMergeAndUpdateAliasing(
+      to_merge, c10::nullopt, db, [&]() {
+        return createSingletonSubgraph(to_merge, subgraphKind);
+      });
+}
+
 } // namespace SubgraphUtils
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.h b/torch/csrc/jit/passes/utils/subgraph_utils.h
index 94150258b5fa..77c3d388425f 100644
--- a/torch/csrc/jit/passes/utils/subgraph_utils.h
+++ b/torch/csrc/jit/passes/utils/subgraph_utils.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include 
+#include 
 #include 
 
 namespace torch {
@@ -26,6 +27,13 @@ TORCH_API Node* createSingletonSubgraph(
     Symbol subgraphKind,
     std::unordered_map& vmap);
 
+// Creates a new subgraph that only contains `n`, and updates the new outputs
+// of the subgraph to have the aliasing properties of the original `n` outputs
+TORCH_API Node* createSingletonSubgraphAndUpdateAliasing(
+    Node* to_merge,
+    Symbol subgraphKind,
+    AliasDb& db);
+
 // Merge a node into a subgraph node. If `toMerge` is also a subgraph, the
 // subgraphs are merged.
 // `toMerge` is destroyed.
@@ -37,6 +45,14 @@ TORCH_API void mergeNodeIntoSubgraph(
     Node* subgraphNode,
     std::unordered_map& vmap);
 
+// Merges a node into a subgraph node, and updates the new outputs of the
+// subgraph to have the aliasing properties of the corresponding `to_merge`
+// outputs
+TORCH_API void mergeNodeIntoSubgraphAndUpdateAliasing(
+    Node* to_merge,
+    Node* subgraphNode,
+    AliasDb& db);
+
 // Move nodes from a subgraph node to the outer graph.
 // `subgraphNode` is destroyed.
 // An optional argument 'vmap' could be used to retrieve value mappings.

From 5dd288eb066bc178a89447453c7fba961a3e0174 Mon Sep 17 00:00:00 2001
From: Elias Ellison
Date: Thu, 24 Sep 2020 15:22:16 -0700
Subject: [PATCH 111/449] [JIT] Regularize tensorexpr fuser strategy with other fusers (#44972)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44972

Previously, our fusion strategy would be:
- start at the end of the block, find a fusable node
- iteratively try to merge inputs into the fusion group, sorted topologically

This strategy works pretty well, but has the possibility of missing fusion groups. See my attached test case for an example where we wouldn't find all possible fusion groups. bertmaher found an example of a missed fusion group in one of our rnn examples (jit_premul) that caused a regression from the legacy fuser.

Here, I'm updating our fusion strategy to be the same as our other fusion passes - create_autodiff_subgraphs, and graph_fuser.cpp.
The basic strategy is:
- iterate until you find a fusible node
- try to merge the node's inputs; whenever a successful merge occurs, restart at the beginning of the node's inputs
- after you've exhausted a node, continue searching the block for fusion opportunities from the node
- continue doing this on the block until we go through an iteration without any successful merges

Since we create the fusion groups once, and only re-specialize within the fusion groups, we should be running this very infrequently (only re-triggers when we fail undefinedness specializations). Also, because it's the same algorithm as the existing fuser, it is unlikely to cause a regression.

Test Plan: Imported from OSS

Reviewed By: Krovatkin, robieta

Differential Revision: D23821581

Pulled By: eellison

fbshipit-source-id: e513d1ef719120dadb0bfafc7a14f4254cd806ee
---
 test/jit/test_profiler.py                    |  22 +++
 torch/csrc/jit/passes/tensorexpr_fuser.cpp   | 184 +++++++++---------
 .../csrc/jit/passes/utils/subgraph_utils.cpp |  28 ++-
 torch/csrc/jit/passes/utils/subgraph_utils.h |  10 +-
 4 files changed, 146 insertions(+), 98 deletions(-)

diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py
index 50d4351a4870..55604f5ff6bf 100644
--- a/test/jit/test_profiler.py
+++ b/test/jit/test_profiler.py
@@ -83,6 +83,7 @@ def test_fuse(a, b):
         # that guards a tensorexpr group
         optimized_block = next(g.findNode("prim::If").blocks())
         if_nodes = list(optimized_block.findAllNodes("prim::If"))
+        self.assertEqual(len(if_nodes), 1)
         FileCheck().check("Group[Subgraph").run(str(if_nodes[0]))
 
         # no broadcasts occurred, sum_to_size have been specialized out
@@ -191,3 +192,24 @@ def foo(a, b):
 
         g = torch.jit.last_executed_optimized_graph()
         FileCheck().check("fallback_function").check_next("CallFunction").run(g)
+
+    def test_iterative_fusion(self):
+        @torch.jit.script
+        def foo(a, b, c, d):
+            a = a + b
+            b.add_(3)
+            c = c + b + d
+            a = a + 1
+            return a, c
+
+        x = torch.ones(1, requires_grad=False)
+        foo(x, x, x, x)
+        foo(x, x, x, x)
+
+        # when we iterate through the block, we will start
+        # by fusing a = a + b with a = a + 1
+        # if we were to continue iteration from that fusion point,
+        # would miss the fusion opportunity of c = c + d + b
+
+        g = torch.jit.last_executed_optimized_graph()
+        self.assertEqual(len(list(g.findAllNodes("prim::TensorExprGroup"))), 2)
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index 4d98110d3975..3782c2af4f33 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
@@ -166,6 +166,7 @@ bool isSupported(Node* node) {
   switch (node->kind()) {
     case prim::ConstantChunk:
     case prim::ListConstruct:
+    case prim::TensorExprGroup:
       return true;
   }
 
@@ -201,12 +202,6 @@ bool texprReductionsEnabled() {
   return texpr_reductions_enabled;
 }
 
-struct nodesComparator {
-  bool operator()(Node* a, Node* b) const {
-    return a->isAfter(b);
-  }
-};
-
 // TODO: if a value has differently typed uses, temporarrily insert a node
 // specializing the type for each use and later remove, instead of bailing
 bool profiledWithDifferentTypes(Value* v) {
@@ -298,6 +293,11 @@ class TensorExprFuser {
     GRAPH_DUMP("After removing redundant profile nodes: ", graph_);
     createFusionGroups(graph_->block());
     GRAPH_DUMP("After creating fusion groups: ", graph_);
+    // we maintain alias db correctness during initial fusion, but it is
+    // difficult to maintain correctness after inlining so inline only after
+    // fusion is done.
+ inlineSmallFusionGroups(graph_->block()); + GRAPH_DUMP("After inlining small fusion groups: ", graph_); guardFusionGroups(graph_->block()); GRAPH_DUMP("After guarding fusion groups: ", graph_); removeTensorTypeSpecializations(graph_->block()); @@ -314,56 +314,44 @@ class TensorExprFuser { n, prim::TensorExprGroup, *aliasDb_); } - // Add unvisited input nodes to the queue for further merging into the fusion - // group. - void updateQueue( - Node* fusion_group, - std::set& queue, - const std::unordered_set& visited) { - for (auto input : fusion_group->inputs()) { - if (!visited.count(input->node())) { - queue.insert(input->node()); + value_list sortReverseTopological(ArrayRef inputs, Block* b) { + value_list result; + for (auto i : inputs) { + if (i->node()->owningBlock() == b) { + result.push_back(i); } } + // Sort in reverse topological order + std::sort(result.begin(), result.end(), [&](Value* a, Value* b) { + return a->node()->isAfter(b->node()); + }); + return result; } // Create a fusion group starting from the node N. // We then try to pull inputs into the fusion group and repeat that process // until there is nothing we can pull in. - Node* createFusionGroup(Node* n) { - // Queue of the nodes we should consider for merging into the fusion groups - // (those nodes are usually inputs of the fusion group). - // We use an ordered set here to visit them in the right order: the fusion - // group is closer to the end of the block and we are trying to pull later - // nodes first. - // NB: the order in the list in theory could stale if we move nodes around. - // However, this should only happen to the nodes we could not fuse, and - // hence it should not be a problem. - std::set queue; - std::unordered_set visited_nodes; - - Node* fusion_group = n; + std::pair createFusionGroup( + Node* fusion_node) { if (min_group_size_ == 1) { - fusion_group = getOrCreateTensorExprSubgraph(n); + fusion_node = getOrCreateTensorExprSubgraph(fusion_node); } - updateQueue(fusion_group, queue, visited_nodes); - GRAPH_DEBUG("Iteratively pull input nodes into the fusion group...\n"); - while (!queue.empty()) { - debugDumpFusionGroup("Current fusion group: ", fusion_group); - GRAPH_DEBUG(queue.size(), " nodes are in the queue.\n"); - - Node* input_node = *queue.begin(); - queue.erase(queue.begin()); - - GRAPH_DEBUG("Trying to merge: ", *input_node); - fusion_group = tryMerge(fusion_group, input_node); - visited_nodes.insert(input_node); - updateQueue(fusion_group, queue, visited_nodes); + auto inputs = sortReverseTopological( + fusion_node->inputs(), fusion_node->owningBlock()); + for (auto input : inputs) { + debugDumpFusionGroup("Current fusion group: ", fusion_node); + GRAPH_DEBUG("Trying to merge: ", *input->node()); + if (auto maybe_fusion_group = tryMerge(fusion_node, input->node())) { + // we successfully merged, so the new group's `inputs` may have + // changed. So rescan the new group for more merging opportunities. + return std::make_pair( + maybe_fusion_group.value()->reverseIterator(), true); + } } - return fusion_group; + return std::make_pair(++fusion_node->reverseIterator(), false); } static void debugDumpFusionGroup(const std::string& msg, Node* n) { @@ -373,69 +361,75 @@ class TensorExprFuser { } } + std::pair scanNode(Node* n) { + GRAPH_DEBUG("Considering node:", *n) + + if (!canHandle(n)) { + return std::make_pair(++n->reverseIterator(), false); + } + // There are some nodes that we can support, but we don't want to start a + // fusion group from - skip them. 
+ if (n->kind() == prim::ListConstruct || n->kind() == aten::slice || + n->kind() == aten::unsqueeze || n->kind() == prim::ConstantChunk || + n->kind() == prim::Constant) { + return std::make_pair(++n->reverseIterator(), false); + } + return createFusionGroup(n); + } + // Merge fusible nodes into subgraphs in prim::TensorExprGroup nodes. void createFusionGroups(Block* block) { - std::vector fusion_groups; - auto reverse_iter = block->nodes().reverse(); - Node* prev_fusion_group = nullptr; - for (auto it = reverse_iter.begin(); it != reverse_iter.end();) { - Node* n = *it; - GRAPH_DEBUG("Considering node:", *n) + bool any_changed = true; + while (any_changed) { + any_changed = false; + for (auto it = block->nodes().rbegin(); it != block->nodes().rend();) { + bool changed; + std::tie(it, changed) = scanNode(*it); + any_changed |= changed; + } + } + for (Node* n : block->nodes()) { for (Block* b : n->blocks()) { createFusionGroups(b); } + } - if (!canHandle(n)) { - it++; - continue; - } - // There are some nodes that we can support, but we don't want to start a - // fusion group from - skip them. - if (n->kind() == prim::ListConstruct || n->kind() == aten::slice || - n->kind() == aten::unsqueeze || n->kind() == prim::ConstantChunk || - n->kind() == prim::Constant) { - it++; - continue; + // Try to merge adjacent fusion groups together. Because we have only merged + // by looking at graph inputs, without this we would not attempt to merge + // adjacent fusion groups that don't have a depdency on each other + + std::vector initial_fusion_groups; + for (Node* n : block->nodes()) { + if (n->kind() == prim::TensorExprGroup) { + initial_fusion_groups.push_back(n); } + } - Node* fusion_group = createFusionGroup(n); - debugDumpFusionGroup("Fusion group constructed: ", fusion_group); + Node* prev_fusion_group = + initial_fusion_groups.size() ? initial_fusion_groups[0] : nullptr; + for (size_t i = 1; i < initial_fusion_groups.size(); ++i) { // Try merging the just created fusion group into the previous one. // If it did not work, then put the previous fusion group into // fusion_groups vector - we will not touch it anymore in this loop. // If merging suceeded, save the merged group as the "previous" fusion // group so that we can try to merge the next one into it. 
- if (prev_fusion_group) { + + Node* fusion_group = initial_fusion_groups[i]; + debugDumpFusionGroup( + "Trying to merge into the previous fusion group: ", + prev_fusion_group); + if (auto merged_fusion_group = + tryMerge(prev_fusion_group, fusion_group)) { + prev_fusion_group = *merged_fusion_group; debugDumpFusionGroup( - "Trying to merge into the previous fusion group: ", + "Successfully merged into the previous fusion group: ", prev_fusion_group); - if (canMerge(prev_fusion_group, fusion_group)) { - prev_fusion_group = tryMerge(prev_fusion_group, fusion_group); - debugDumpFusionGroup( - "Successfully merged into the previous fusion group: ", - prev_fusion_group); - } else { - GRAPH_DEBUG("Cannot merge into the previous fusion group"); - fusion_groups.push_back(prev_fusion_group); - prev_fusion_group = fusion_group; - } } else { + GRAPH_DEBUG("Cannot merge into the previous fusion group"); prev_fusion_group = fusion_group; } - it = prev_fusion_group->reverseIterator(); - it++; - } - - // We were adding groups into the vector lagging by one - catch up with - // adding the last one - if (prev_fusion_group) { - fusion_groups.push_back(prev_fusion_group); - } - - for (Node* n : fusion_groups) { - inlineIfTooSmall(n); } } @@ -471,9 +465,21 @@ class TensorExprFuser { return false; } - Node* tryMerge(Node* fusion_group, Node* to_merge) { + void inlineSmallFusionGroups(Block* block) { + for (auto it = block->nodes().begin(); it != block->nodes().end();) { + Node* n = *it; + it++; + + for (Block* b : n->blocks()) { + inlineSmallFusionGroups(b); + } + inlineIfTooSmall(n); + } + } + + c10::optional tryMerge(Node* fusion_group, Node* to_merge) { if (!canMerge(fusion_group, to_merge)) { - return fusion_group; + return c10::nullopt; } std::vector nodes_to_merge = {to_merge}; @@ -490,7 +496,7 @@ class TensorExprFuser { GRAPH_UPDATE("Trying to move node next to fusion group: ", getHeader(n)); if (!aliasDb_->moveBeforeTopologicallyValid(n, move_point)) { GRAPH_UPDATE("Failed to move because of AliasDB checks!"); - return fusion_group; + return c10::nullopt; } move_point = n; } diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.cpp b/torch/csrc/jit/passes/utils/subgraph_utils.cpp index 1576aca36fa8..73976cb66bc8 100644 --- a/torch/csrc/jit/passes/utils/subgraph_utils.cpp +++ b/torch/csrc/jit/passes/utils/subgraph_utils.cpp @@ -117,11 +117,20 @@ void mergeSubgraph( // Now we're merging the "unmerged" nodes into the mergeFrom subgraph. That // will give us a new map: "unmerged" -> "merged". 
std::unordered_map merge_vmap; + + // defer destroying nodes until after all nodes have been merged, otherwise we + // run into lifetime issues where the previous mapping of the merged nodes + // inputs/outputs can be overwritten with newly created values + std::vector merged_nodes; while (it != end_it) { - // NB: mergeNodeIntoSubgraph destroys node, hence the complications Node* node = *it; ++it; - mergeNodeIntoSubgraph(node, mergeTo, merge_vmap); + merged_nodes.push_back(node); + mergeNodeIntoSubgraph(node, mergeTo, merge_vmap, /*destroyNode*/ false); + } + + for (Node* n : merged_nodes) { + n->destroy(); } // Vmap should contain "original" -> "merged" mapping, thus we basically need @@ -228,7 +237,8 @@ std::unordered_set closedOverValues( void mergeNodeIntoSubgraph( Node* toMerge, Node* subgraphNode, - std::unordered_map& vmap) { + std::unordered_map& vmap, + bool destroyNode) { AT_ASSERT(hasSubgraph(subgraphNode) && toMerge != subgraphNode); if (hasSubgraph(toMerge)) { return mergeSubgraph(subgraphNode, toMerge, vmap); @@ -334,11 +344,17 @@ void mergeNodeIntoSubgraph( } } // Remove the original node now that the merge is complete - toMerge->destroy(); + if (destroyNode) { + toMerge->destroy(); + } } -void mergeNodeIntoSubgraph(Node* toMerge, Node* subgraphNode) { + +void mergeNodeIntoSubgraph( + Node* toMerge, + Node* subgraphNode, + bool destroyNode) { std::unordered_map vmap; - mergeNodeIntoSubgraph(toMerge, subgraphNode, vmap); + mergeNodeIntoSubgraph(toMerge, subgraphNode, vmap, destroyNode); } Node* createSingletonSubgraph( diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.h b/torch/csrc/jit/passes/utils/subgraph_utils.h index 77c3d388425f..c0ffc3635031 100644 --- a/torch/csrc/jit/passes/utils/subgraph_utils.h +++ b/torch/csrc/jit/passes/utils/subgraph_utils.h @@ -36,14 +36,18 @@ TORCH_API Node* createSingletonSubgraphAndUpdateAliasing( // Merge a node into a subgraph node. If `toMerge` is also a subgraph, the // subgraphs are merged. -// `toMerge` is destroyed. +// If `destroyNode` is true `toMerge` is destroyed. // An optional argument 'vmap' could be used to retrieve value mappings. // Values will be mapped to their new subgraph values -TORCH_API void mergeNodeIntoSubgraph(Node* toMerge, Node* subgraphNode); TORCH_API void mergeNodeIntoSubgraph( Node* toMerge, Node* subgraphNode, - std::unordered_map& vmap); + bool destroyNode = true); +TORCH_API void mergeNodeIntoSubgraph( + Node* toMerge, + Node* subgraphNode, + std::unordered_map& vmap, + bool destroyNode = true); // Merges a node into a subgraph node, and updates the new outputs of the // subgraph to have the aliasing properties of the corresponding `to_merge` From bee1d448e76837e7ffc066fcad576ccb98e92ee1 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Thu, 24 Sep 2020 15:55:35 -0700 Subject: [PATCH 112/449] Fix test_rpc_profiling_remote_record_function (#45162) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45162 This test was flaky because it was not able to validate that the overall record_function's CPU times are greater than the sum of its children. It turns out that this is a general bug in the profiler that can be reproduced without RPC, see https://github.com/pytorch/pytorch/issues/45160. Hence, removing this from the test and replacing it by just validating the expected children. Ran the test 1000 times and they all passed. 
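For illustration, a minimal standalone sketch of the recursive child collection this change relies on (hypothetical helper names; it assumes profiler events expose `name` and `cpu_children` as in torch.autograd.profiler, mirroring the test code below):

```python
REMOTE_OP_STR = "#remote_op: "

def get_all_cpu_children(event):
    # Sketch only: gather all transitive cpu children of a profiler event.
    # `event` is assumed to expose `cpu_children` (direct children only).
    children = []
    for child in event.cpu_children:
        children.append(child)
        children.extend(get_all_cpu_children(child))
    return children

def to_local_name(remote_event_name):
    # Strip the "#remote_op: " prefix so remote event names compare
    # against local operator names.
    return remote_event_name.split(REMOTE_OP_STR)[-1]

def check_is_child(remote_event, record_function_event):
    # Validate that the remote op shows up somewhere under the
    # record_function event, rather than comparing CPU times.
    child_names = [to_local_name(c.name)
                   for c in get_all_cpu_children(record_function_event)]
    assert to_local_name(remote_event.name) in child_names
```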
ghstack-source-id: 112632327 Test Plan: CI Reviewed By: mrshenli Differential Revision: D23851854 fbshipit-source-id: 5d9023acd17800a6668ba4849659d8cc902b8d6c --- .../_internal/distributed/rpc/rpc_test.py | 48 ++++++++++++++----- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 797e5a010b86..163a772628a5 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -1302,19 +1302,41 @@ def test_rpc_profiling_remote_record_function(self): "aten::zero_", "aten::fill_", ] - remote_ops_time = sum( - evt.cpu_time_total - for evt in remaining_remote_events - if not any( - [ - rf_entry_event in evt.name - for rf_entry_event in remote_events_denylist - ] - ) - ) - self.assertGreaterEqual( - record_function_remote_event.cpu_time_total, remote_ops_time - ) + + REMOTE_OP_STR = "#remote_op: " + + def convert_remote_to_local(event_name): + remote_op_key = REMOTE_OP_STR + return event_name[event_name.find(remote_op_key) + len(remote_op_key) :] + + # Ideally, we should validate that the sum of remote operations within + # record_function are less than record_function's CPU time. However, + # there is a known bug in profiling + # (https://github.com/pytorch/pytorch/issues/45160) due to which we + # can't do this. So, we just validate they are child events. + prof.key_averages() + + # cpu_children only returns direct children, so here we get all + # children recursively. + def get_cpu_children(event): + if not event.cpu_children: + return [] + cpu_children = event.cpu_children + for e in event.cpu_children: + cpu_children.extend(get_cpu_children(e)) + return cpu_children + + record_function_children_names = [ + convert_remote_to_local(c.name) + for c in get_cpu_children(record_function_remote_event) + ] + for evt in remaining_remote_events: + local_name = convert_remote_to_local(evt.name) + if local_name not in remote_events_denylist: + self.assertTrue( + local_name in record_function_children_names, + f"{local_name} not in {record_function_children_names}", + ) def validate_profiling_workload(self, dst, prof): REMOTE_OP_STR = "#remote_op: " From 92ebb04f9206882e6d312a8b91318545f43a53c2 Mon Sep 17 00:00:00 2001 From: Himangshu Date: Thu, 24 Sep 2020 16:24:36 -0700 Subject: [PATCH 113/449] added check for NumberType (#44375) Summary: Fixes https://github.com/pytorch/pytorch/issues/44107 Pull Request resolved: https://github.com/pytorch/pytorch/pull/44375 Reviewed By: mrshenli Differential Revision: D23906728 Pulled By: eellison fbshipit-source-id: 3b534e5dd3af1f5e43a7314953e64117cbe8ffe4 --- torch/csrc/jit/frontend/ir_emitter.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index 690e52d7131d..99ce4140c58a 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -2189,13 +2189,13 @@ struct to_ir { NamedValue emitValueToTensor( const NamedValue& value, const NamedValue& matchTypeOf) { - // Add implicit conversion of int/float/bool types to tensors + // Add implicit conversion of int/float/bool/number types to tensors // Used in emitSubscriptAssign to convert: // `tensor(...)[x] = 99` to `tensor(...)[x] = tensor(99)` // Mirrors the `valueToTensor` behavior in python_variable_indexing.cpp const auto kind = value.type()->kind(); - if (kind == c10::TypeKind::IntType || kind == 
c10::TypeKind::BoolType || - kind == c10::TypeKind::FloatType) { + if (kind == c10::TypeKind::NumberType || kind == c10::TypeKind::IntType || + kind == c10::TypeKind::BoolType || kind == c10::TypeKind::FloatType) { auto dtype = graph->insert(prim::dtype, {matchTypeOf}, {}); auto device = graph->insert(prim::device, {matchTypeOf}, {}); auto converted = graph->insert( From 0b6e5ad4a92636ec82fb103b82303785c078407a Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Thu, 24 Sep 2020 16:38:14 -0700 Subject: [PATCH 114/449] Resolve comments in #44354. (#45150) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45150 Test Plan: Imported from OSS Reviewed By: bhosmer Differential Revision: D23846796 Pulled By: ailzhang fbshipit-source-id: 7bef89d833848ac3f8993c4c037acf1d4f2ca674 --- aten/src/ATen/core/boxing/KernelFunction.cpp | 1 + aten/src/ATen/core/dispatch/OperatorEntry.cpp | 13 ++++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index b5d552e0e31c..f84352ebee1f 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -22,6 +22,7 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, Stack*) { void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, Stack*) { TORCH_INTERNAL_ASSERT(0, op.operator_name(), " has kernels registered to both Math and a backend mapped to AutogradOther. " + "This makes the backend kernel unreachable (see Note [Ambiguity in AutogradOther kernel]). " "If it's intended to override Math kernel behavior, please open an issue to request a dedicated " "Autograd dispatch key for the backend."); } diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 5fa379e40710..0942659d2960 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -157,10 +157,9 @@ const KernelFunction& OperatorEntry::computeDispatchTableEntry(const c10::Dispat } bool OperatorEntry::hasKernelForDispatchKeySet(DispatchKeySet ks) const { - for (auto k : ks) { - if (kernels_.find(k) != kernels_.end()) { - return true; - } + TORCH_INTERNAL_ASSERT(kernels_.find(DispatchKey::Undefined) == kernels_.end()); + for (auto& kv : kernels_) { + if (ks.has(kv.first)) return true; } return false; } @@ -196,6 +195,9 @@ std::pair OperatorEntry::computeDispatchTab // In the past we directly call into backends(filled with catchAll) after BackendSelect. // Now that we first call Autograd backend keys after BackendSelect, we should fill those // with catchAll as well. + // The implementation of (2.1) & (2.3) relies on the invariant that for a given backend, + // `computeDispatchTableEntryWithDebug()` will be called for that backend's autograd key after the + // backend key. See Note [Refresh Runtime Autograd entries in dispatchTable_] // (3) Use fallthrough kernel that are registered as fallback. // (4) Use catchAll kernel if available // Alias Key Precedence: @@ -272,7 +274,8 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp for (auto k : c10::getRuntimeDispatchKeySet(dispatch_key)) { updateDispatchTableEntry_(dispatcher, k); } - // Registering to backend key might affect computed entry at its Autograd backend key due to 2.2. 
+ // Note [Refresh Runtime Autograd entries in dispatchTable_] + // Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3). DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key); updateDispatchTableEntry_(dispatcher, autograd_key); } From 677a59dcaa72fbc91abfe01731a41e0849e81154 Mon Sep 17 00:00:00 2001 From: Daya Khudia Date: Thu, 24 Sep 2020 17:19:08 -0700 Subject: [PATCH 115/449] [aten] Call fbgemm functions for embedding prepack/unpack (#44845) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44845 fbgemm functions are vectorized and faster ``` Finished test run: https://our.intern.facebook.com/intern/testinfra/testrun/6473924484856786 Summary (total time 15.08s): PASS: 7 FAIL: 0 SKIP: 0 FATAL: 0 TIMEOUT: 0 OMIT: 0 ``` Performance Before: ``` # ---------------------------------------- # PyTorch/Caffe2 Operator Micro-benchmarks # ---------------------------------------- # Tag : short # Benchmarking PyTorch: qembeddingbag_byte_prepack # Mode: Eager # Name: qembeddingbag_byte_prepack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 68.727 # Benchmarking PyTorch: qembeddingbag_byte_prepack # Mode: Eager # Name: qembeddingbag_byte_prepack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 131.500 # Benchmarking PyTorch: qembeddingbag_byte_prepack # Mode: Eager # Name: qembeddingbag_byte_prepack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 248.190 # Benchmarking PyTorch: qembeddingbag_4bit_prepack # Mode: Eager # Name: qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 172.742 # Benchmarking PyTorch: qembeddingbag_4bit_prepack # Mode: Eager # Name: qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 333.008 # Benchmarking PyTorch: qembeddingbag_4bit_prepack # Mode: Eager # Name: qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 652.423 # Benchmarking PyTorch: qembeddingbag_2bit_prepack # Mode: Eager # Name: qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 167.282 # Benchmarking PyTorch: qembeddingbag_2bit_prepack # Mode: Eager # Name: qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 398.901 # Benchmarking PyTorch: qembeddingbag_2bit_prepack # Mode: Eager # Name: qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 785.254 # Benchmarking PyTorch: qembeddingbag_byte_unpack # Mode: Eager # Name: qembeddingbag_byte_unpack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 122.653 # Benchmarking PyTorch: qembeddingbag_byte_unpack # Mode: Eager # Name: qembeddingbag_byte_unpack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 230.617 # Benchmarking PyTorch: qembeddingbag_byte_unpack # Mode: Eager # Name: qembeddingbag_byte_unpack_num_embeddings80_embedding_dim512 # Input: 
num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 408.807 # Benchmarking PyTorch: qembeddingbag_4bit_unpack # Mode: Eager # Name: qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 176.087 # Benchmarking PyTorch: qembeddingbag_4bit_unpack # Mode: Eager # Name: qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 337.514 # Benchmarking PyTorch: qembeddingbag_4bit_unpack # Mode: Eager # Name: qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 659.716 # Benchmarking PyTorch: qembeddingbag_2bit_unpack # Mode: Eager # Name: qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 342.529 # Benchmarking PyTorch: qembeddingbag_2bit_unpack # Mode: Eager # Name: qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 665.197 # Benchmarking PyTorch: qembeddingbag_2bit_unpack # Mode: Eager # Name: qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 1307.923 ``` Performance After: ``` # ---------------------------------------- # PyTorch/Caffe2 Operator Micro-benchmarks # ---------------------------------------- # Tag : short # Benchmarking PyTorch: qembeddingbag_byte_prepack # Mode: Eager # Name: qembeddingbag_byte_prepack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 10.782 # Benchmarking PyTorch: qembeddingbag_byte_prepack # Mode: Eager # Name: qembeddingbag_byte_prepack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 17.443 # Benchmarking PyTorch: qembeddingbag_byte_prepack # Mode: Eager # Name: qembeddingbag_byte_prepack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 25.898 # Benchmarking PyTorch: qembeddingbag_4bit_prepack # Mode: Eager # Name: qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 13.903 # Benchmarking PyTorch: qembeddingbag_4bit_prepack # Mode: Eager # Name: qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 18.575 # Benchmarking PyTorch: qembeddingbag_4bit_prepack # Mode: Eager # Name: qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 30.650 # Benchmarking PyTorch: qembeddingbag_2bit_prepack # Mode: Eager # Name: qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 14.158 # Benchmarking PyTorch: qembeddingbag_2bit_prepack # Mode: Eager # Name: qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 19.818 # Benchmarking PyTorch: qembeddingbag_2bit_prepack # Mode: Eager # Name: qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 30.852 # Benchmarking PyTorch: 
qembeddingbag_byte_unpack # Mode: Eager # Name: qembeddingbag_byte_unpack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 47.596 # Benchmarking PyTorch: qembeddingbag_byte_unpack # Mode: Eager # Name: qembeddingbag_byte_unpack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 91.025 # Benchmarking PyTorch: qembeddingbag_byte_unpack # Mode: Eager # Name: qembeddingbag_byte_unpack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 131.425 # Benchmarking PyTorch: qembeddingbag_4bit_unpack # Mode: Eager # Name: qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 12.637 # Benchmarking PyTorch: qembeddingbag_4bit_unpack # Mode: Eager # Name: qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 20.856 # Benchmarking PyTorch: qembeddingbag_4bit_unpack # Mode: Eager # Name: qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 33.944 # Benchmarking PyTorch: qembeddingbag_2bit_unpack # Mode: Eager # Name: qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 21.181 # Benchmarking PyTorch: qembeddingbag_2bit_unpack # Mode: Eager # Name: qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 34.213 # Benchmarking PyTorch: qembeddingbag_2bit_unpack # Mode: Eager # Name: qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 59.622 ``` ghstack-source-id: 112836216 Test Plan: buck test //caffe2/test:quantization -- 'test_embedding_bag*' --print-passing-details Reviewed By: radkris-git Differential Revision: D23675777 fbshipit-source-id: 0b1a787864663daecc7449295f9ab6264eac52fc --- .../quantized/cpu/qembeddingbag_prepack.cpp | 118 ++++++++++-------- .../quantized/cpu/qembeddingbag_unpack.cpp | 17 ++- 2 files changed, 83 insertions(+), 52 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp index 6c67b6cc6c86..96d592594d04 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp @@ -104,8 +104,6 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { embedding_rows, embedding_cols + 8}; // extra 8 bytes to store FP scale and zero_point per row. 
- size_t output_columns = output_shape[1]; - constexpr float kEpsilon = 1e-8f; // Allocate output packed weights auto output = at::empty( @@ -114,6 +112,12 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { weight_contig.suggest_memory_format()); auto* output_data = output.data_ptr(); +#ifdef USE_FBGEMM + fbgemm::FloatToFused8BitRowwiseQuantizedSBFloat( + weight_data, embedding_rows, embedding_cols, output_data); +#else + size_t output_columns = output_shape[1]; + constexpr float kEpsilon = 1e-8f; for (std::size_t row = 0; row < embedding_rows; ++row) { const float* input_row = weight_data + row * embedding_cols; std::uint8_t* output_row = output_data + row * output_columns; @@ -134,6 +138,8 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { lrintf((input_row[col] - minimum_element) * inverse_scale); } // embedding_cols } // embedding_rows +#endif // USE_FBGEMM + return output; } @@ -175,57 +181,69 @@ Tensor _qembeddingbag_nbit_prepack_helper( weight_contig.options().dtype(at::kByte), weight_contig.suggest_memory_format()); auto* output_data = output.data_ptr(); - const auto output_columns = output.size(output.dim() - 1); - - for (int row = 0; row < embedding_rows; ++row) { - const float* input_row = weight_data + row * embedding_cols; - std::uint8_t* output_row = output_data + row * output_columns; - float Xmin, Xmax; - if (optimized_qparams) { - std::tie(Xmax, Xmin) = at::choose_qparams_optimized( - weight_contig[row], embedding_cols, 200, 0.16, bit_width); - } else { - Xmin = *std::min_element(input_row, input_row + embedding_cols); - Xmax = *std::max_element(input_row, input_row + embedding_cols); - } - Xmin = static_cast(Xmin); - float range = Xmax - Xmin; - // Set scale to 1.0f for the corner case of Xmax == Xmin . - // Any non-zero scale would work because during quantization - // (X - Xmin) / scale will be 0 for all X unless scale is 0. - at::Half scale = range == 0 ? 1.0f : range / ((1 << bit_width) - 1); - float inverse_scale = scale == 0 ? 1.0f : 1.0f / scale; - if (scale == 0 || std::isinf(inverse_scale)) { - // Corner case handling when Xmax == Xmin - // Any scale would work because X - Xmin will be 0 for all X - scale = 1.0f; - inverse_scale = 1.0f; - } - // Update the scale and zero_point of each row. - at::Half* output_row_scale_zp = reinterpret_cast( - output_row + - (embedding_cols + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); - - output_row_scale_zp[0] = scale; - output_row_scale_zp[1] = Xmin; - - // Pack the weight values. - for (int col = 0; col < embedding_cols; ++col) { - float X = input_row[col]; - std::uint8_t quantized = std::max( - 0, - std::min(lrintf((X - Xmin) * inverse_scale), (1 << bit_width) - 1)); - // We pack 2 4-bit values in a byte. Index 0 is packed in the lower 4-bits - // and index 1 is packed in the upper 4-bits. 
- if (col % NUM_ELEM_PER_BYTE == 0) { - output_row[col / NUM_ELEM_PER_BYTE] = quantized; +#ifdef USE_FBGEMM + if (!optimized_qparams) { + fbgemm::FloatToFusedNBitRowwiseQuantizedSBHalf( + bit_width, weight_data, embedding_rows, embedding_cols, output_data); + } else { +#endif // USE_FBGEMM + const auto output_columns = output.size(output.dim() - 1); + + for (int row = 0; row < embedding_rows; ++row) { + const float* input_row = weight_data + row * embedding_cols; + std::uint8_t* output_row = output_data + row * output_columns; + + float Xmin, Xmax; + if (optimized_qparams) { + std::tie(Xmax, Xmin) = at::choose_qparams_optimized( + weight_contig[row], embedding_cols, 200, 0.16, bit_width); } else { - output_row[col / NUM_ELEM_PER_BYTE] |= - (quantized << ((col % NUM_ELEM_PER_BYTE) * bit_width)); + Xmin = *std::min_element(input_row, input_row + embedding_cols); + Xmax = *std::max_element(input_row, input_row + embedding_cols); } - } // embedding_cols - } // embedding_rows + Xmin = static_cast(Xmin); + float range = Xmax - Xmin; + // Set scale to 1.0f for the corner case of Xmax == Xmin . + // Any non-zero scale would work because during quantization + // (X - Xmin) / scale will be 0 for all X unless scale is 0. + at::Half scale = range == 0 ? 1.0f : range / ((1 << bit_width) - 1); + float inverse_scale = scale == 0 ? 1.0f : 1.0f / scale; + if (scale == 0 || std::isinf(inverse_scale)) { + // Corner case handling when Xmax == Xmin + // Any scale would work because X - Xmin will be 0 for all X + scale = 1.0f; + inverse_scale = 1.0f; + } + // Update the scale and zero_point of each row. + at::Half* output_row_scale_zp = reinterpret_cast( + output_row + + (embedding_cols + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); + + output_row_scale_zp[0] = scale; + output_row_scale_zp[1] = Xmin; + + // Pack the weight values. + for (int col = 0; col < embedding_cols; ++col) { + float X = input_row[col]; + std::uint8_t quantized = std::max( + 0, + std::min( + lrintf((X - Xmin) * inverse_scale), (1 << bit_width) - 1)); + // We pack 2 4-bit values in a byte. Index 0 is packed in the lower + // 4-bits and index 1 is packed in the upper 4-bits. 
+ if (col % NUM_ELEM_PER_BYTE == 0) { + output_row[col / NUM_ELEM_PER_BYTE] = quantized; + } else { + output_row[col / NUM_ELEM_PER_BYTE] |= + (quantized << ((col % NUM_ELEM_PER_BYTE) * bit_width)); + } + } // embedding_cols + } // embedding_rows +#ifdef USE_FBGEMM + } +#endif // USE_FBGEMM + return output; } diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp index 72d42c61d0e5..4a9ae73ee137 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp @@ -73,6 +73,10 @@ Tensor qembeddingbag_byte_unpack(const Tensor& packed_weight) { packed_weight.suggest_memory_format()); float* output_data = output.data_ptr(); +#ifdef USE_FBGEMM + fbgemm::Fused8BitRowwiseQuantizedSBFloatToFloat( + input, input_rows, input_columns, output_data); +#else for (std::size_t row = 0; row < input_rows; ++row) { const std::uint8_t* input_row = input + row * input_columns; const float* input_row_scale_zp = @@ -84,14 +88,17 @@ Tensor qembeddingbag_byte_unpack(const Tensor& packed_weight) { input_row[col] * input_row_scale_zp[0] + input_row_scale_zp[1]; } // output_columns } // input_rows +#endif // USE_FBGEMM return output; } -Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RATE) { +Tensor _qembeddingbag_nbit_unpack_helper( + const Tensor& packed_weight, + int BIT_RATE) { const auto input_rows = packed_weight.size(0); const auto input_columns = packed_weight.size(1); const auto* input_data = packed_weight.data_ptr(); - int NUM_ELEM_PER_BYTE = 8/BIT_RATE; + int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; // The last 4 bytes per row are two fp16 scale and zero_point. // The rest of input_columns is the number of values in the original row. @@ -105,6 +112,10 @@ Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RA packed_weight.options().dtype(kFloat), packed_weight.suggest_memory_format()); float* output_data = output.data_ptr(); +#ifdef USE_FBGEMM + fbgemm::FusedNBitRowwiseQuantizedSBHalfToFloat( + BIT_RATE, input_data, input_rows, input_columns, output_data); +#else auto output_columns = output_dimensions[1]; for (size_t row = 0; row < input_rows; ++row) { float* output_row = output_data + row * output_columns; @@ -122,6 +133,8 @@ Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RA output_row[col] = scale * quantized + zero_point; } // output_columns } // input_rows +#endif // USE_FBGEMM + return output; } From 03dde4c62af35a8a8a0c2e1ea9f6486ac897a780 Mon Sep 17 00:00:00 2001 From: Dianshi Li Date: Thu, 24 Sep 2020 18:39:54 -0700 Subject: [PATCH 116/449] Resend diff D23858329 (#45315) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45315 Pull Request resolved: https://github.com/pytorch/pytorch/pull/45314 in D23858329 (https://github.com/pytorch/pytorch/commit/721cfbf8425cf2c1dc5e27d1332e32e1a42ef541), we put PriorCorrectionCalibrationPrediction unit test in OSS file which causes test failure issue in public trunk. this diff moves it to FB only test file. Test Plan: ``` buck test //caffe2/caffe2/python/operator_test:torch_integration_test -- test_gather_ranges_to_dense_op buck test //caffe2/caffe2/fb/python/operator_test:torch_integration_test -- test_prior_correct_calibration_prediction_op ``` all pass. 
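For reference, a minimal sketch (not part of this diff) of how the exported operator becomes reachable from Python once the C10 registration below is in place; the toy inputs and keyword arguments simply mirror the new OSS test added further down.

```
import torch

# Toy inputs shaped like the ones in the new test_gather_ranges_to_dense_op test.
data = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
ranges = torch.tensor([[[2, 4]], [[0, 0]]])
key = torch.tensor([0, 1, 3, 2, 1, 0, 1, 0])

# The C10_EXPORT_CAFFE2_OP_TO_C10_CPU registration exposes the op under the
# torch.ops._caffe2 namespace with the schema declared in the .cc file.
outputs = torch.ops._caffe2.GatherRangesToDense(
    data,
    ranges,
    key,
    lengths=[4],
    min_observation=2,
    max_mismatched_ratio=0.5,
    max_empty_ratio=1.0,
)
# `outputs` is a list of dense tensors, one per entry in `lengths`.
```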
Reviewed By: houseroad Differential Revision: D23899012 fbshipit-source-id: 1ed97d8702e2765991e6caf5695d4c49353dae82 --- caffe2/operators/gather_ranges_to_dense_op.cc | 8 ++++ caffe2/operators/gather_ranges_to_dense_op.h | 3 ++ .../operator_test/torch_integration_test.py | 41 +++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/caffe2/operators/gather_ranges_to_dense_op.cc b/caffe2/operators/gather_ranges_to_dense_op.cc index 10396aafc97e..aa31ef12b36a 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.cc +++ b/caffe2/operators/gather_ranges_to_dense_op.cc @@ -104,3 +104,11 @@ NO_GRADIENT(GatherRangesToDense); } // namespace } // namespace caffe2 + +using GatherRangesToDenseCPUOp = + caffe2::GatherRangesToDenseOp; + +C10_EXPORT_CAFFE2_OP_TO_C10_CPU( + GatherRangesToDense, + "_caffe2::GatherRangesToDense(Tensor data, Tensor ranges, Tensor? key, int[] lengths, int min_observation, float max_mismatched_ratio, float max_empty_ratio) -> Tensor[] outputs", + GatherRangesToDenseCPUOp); diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index c1dd5a527005..217a61b25129 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" +#include "caffe2/core/export_caffe2_op_to_c10.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" @@ -15,6 +16,8 @@ #include #include +C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(GatherRangesToDense); + namespace caffe2 { template class GatherRangesToDenseOp final : public Operator { diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index 55f26a89987f..9bec64764240 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -875,6 +875,47 @@ def _batch_bucket_one_hot_ref(data, lengths, boundaries): ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) + def test_gather_ranges_to_dense_op(self): + data = np.array([1, 2, 3, 4, 5, 6, 7, 8]) + ranges = np.array([[[2, 4]], [[0, 0]]]) + key = np.array([0, 1, 3, 2, 1, 0, 1, 0]) + lengths = np.array([4]) + min_observation = 2 + max_mismatched_ratio = 0.5 + max_empty_ratio = 1.0 + + outputs_name = ["X_{}".format(i) for i in range(len(lengths))] + ref_op = core.CreateOperator( + "GatherRangesToDense", + ["data", "ranges", "key"], + outputs_name, + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + workspace.FeedBlob("data", data) + workspace.FeedBlob("ranges", ranges) + workspace.FeedBlob("key", key) + workspace.RunOperatorOnce(ref_op) + ref_outputs = [] + for output_name in outputs_name: + ref_outputs.append(workspace.FetchBlob(output_name)) + + outputs = torch.ops._caffe2.GatherRangesToDense( + torch.from_numpy(data), + torch.from_numpy(ranges), + torch.from_numpy(key), + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + + self.assertEqual(len(ref_outputs), len(outputs)) + for i in range(0, len(ref_outputs)): + np.testing.assert_array_almost_equal(ref_outputs[i], outputs[i].numpy()) + @given(lengths_0=st.integers(1, 10), lengths_1=st.integers(1, 10)) @settings(deadline=1000) def test_merge_id_lists(self, lengths_0, lengths_1): From 
0f2c648c970d33fe7cc6a8198e9ce59a584ae734 Mon Sep 17 00:00:00 2001 From: Linbin Yu Date: Thu, 24 Sep 2020 20:06:42 -0700 Subject: [PATCH 117/449] log metadata when model loading failed (#44430) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44430 log metadata even when model loading is failed Test Plan: {F331550976} Reviewed By: husthyc Differential Revision: D23577711 fbshipit-source-id: 0504e75625f377269f1e5df0f1ebe34b8e564c4b --- torch/csrc/jit/mobile/import.cpp | 19 +++++++++++++++---- torch/csrc/jit/mobile/observer.h | 3 +++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index e812fd978c9f..e26177605674 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -228,6 +228,8 @@ class BytecodeDeserializer final { public: explicit BytecodeDeserializer(std::unique_ptr reader); mobile::Module deserialize(c10::optional device); + std::unordered_map deserializeMetadata( + c10::optional device); private: c10::IValue readArchive( @@ -246,6 +248,13 @@ BytecodeDeserializer::BytecodeDeserializer( : compilation_unit_(std::make_shared()), reader_(std::move(reader)) {} +std::unordered_map BytecodeDeserializer:: + deserializeMetadata(c10::optional device) { + device_ = device; + auto mcu = std::make_shared(); + return readMobileMetadata(mcu); +} + mobile::Module BytecodeDeserializer::deserialize( c10::optional device) { device_ = device; @@ -397,9 +406,9 @@ mobile::Module _load_for_mobile( if (observer) { observer->onEnterLoadModel(); } + auto reader = torch::make_unique(std::move(rai)); + BytecodeDeserializer deserializer(std::move(reader)); try { - auto reader = torch::make_unique(std::move(rai)); - BytecodeDeserializer deserializer(std::move(reader)); mobile::Module result = deserializer.deserialize(std::move(device)); std::unordered_map copied_metadata = result.metadata(); @@ -412,7 +421,8 @@ mobile::Module _load_for_mobile( return result; } catch (c10::Error& error) { if (observer) { - observer->onFailLoadModel(error.what()); + observer->onFailLoadModel( + error.what(), deserializer.deserializeMetadata(std::move(device))); } TORCH_RETHROW(error); } catch (...) { @@ -429,7 +439,8 @@ mobile::Module _load_for_mobile( } } catch (c10::Error& error) { if (observer) { - observer->onFailLoadModel(error.what()); + observer->onFailLoadModel( + error.what(), deserializer.deserializeMetadata(std::move(device))); } TORCH_RETHROW(error); } diff --git a/torch/csrc/jit/mobile/observer.h b/torch/csrc/jit/mobile/observer.h index fde99f501f72..2935fa078fc7 100644 --- a/torch/csrc/jit/mobile/observer.h +++ b/torch/csrc/jit/mobile/observer.h @@ -78,6 +78,9 @@ class MobileModuleObserver { virtual void onExitLoadModel( const std::unordered_map&) {} virtual void onFailLoadModel(const char*) {} + virtual void onFailLoadModel( + const char*, + const std::unordered_map&) {} }; class MobileObserverConfig { From 7e5492e1bedef05752f8c8961d8bcc1a7e5f641e Mon Sep 17 00:00:00 2001 From: Xiao Wang <24860335+xwang233@users.noreply.github.com> Date: Thu, 24 Sep 2020 20:09:47 -0700 Subject: [PATCH 118/449] [minor] Fix undefined variable (#45246) Summary: The commit https://github.com/pytorch/pytorch/commit/2a37f3fd2f74e2d10f3440e6dfef2d5389caab62 https://github.com/pytorch/pytorch/pull/45130 deleted the python variable `capability` which is used in later lines. 
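For context, a self-contained sketch of the failure mode (the helper name and warning text below are illustrative placeholders, not the real module): the warning string is formatted with `capability`, so the branch has to recompute it from the device's major/minor capability before calling `warnings.warn`, which is exactly the one-line fix in the diff.

```
import warnings

# Placeholder warning text; the real message lives in torch/cuda/__init__.py.
INCOMPATIBLE_DEVICE_WARN = (
    "{} with CUDA capability sm_{} is not compatible with the current PyTorch "
    "installation, which supports capabilities {}. Check the install matrix "
    "before using the {} GPU."
)

def warn_if_unsupported(device_name, cap_major, cap_minor, arch_list, supported):
    if not supported:
        # Without this assignment the format() call below referenced an
        # undefined name and raised NameError instead of emitting the warning.
        capability = cap_major * 10 + cap_minor
        warnings.warn(
            INCOMPATIBLE_DEVICE_WARN.format(
                device_name, capability, " ".join(arch_list), device_name
            )
        )

warn_if_unsupported("NVIDIA A100", 8, 0, ["sm_37", "sm_50", "sm_60", "sm_70"], supported=False)
```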
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45246 Reviewed By: walterddr Differential Revision: D23923916 Pulled By: malfet fbshipit-source-id: c5d7fef9e4a87ccc621191200e5965710e9d6aaa --- torch/cuda/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index e8687cad17e8..1176c6ee3060 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -100,6 +100,7 @@ def _check_cubins(): supported = any([sm // 10 == cap_major for sm in supported_sm]) if not supported: device_name = get_device_name(idx) + capability = cap_major * 10 + cap_minor warnings.warn(incompatible_device_warn.format(device_name, capability, " ".join(arch_list), device_name)) From 630bd85aae958495682fb5959f5a97832c2223d7 Mon Sep 17 00:00:00 2001 From: Jiakai Liu Date: Thu, 24 Sep 2020 20:15:31 -0700 Subject: [PATCH 119/449] [pytorch] refine dispatch keys in native_functions.yaml (2/N) (#45284) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45284 This is the 2nd batch of the change described in #45010. In this batch we relaxed some filters to cover more 'backend specific' ops: * ops that not call any 'Tensor::is_xxx()' method OR only call 'Tensor::is_cuda()' - we are adding CUDA dispatch key anyway; * ops that call other ATen ops but ARE differentiable - differentiability is a fuzzy indicator of not being 'composite'; Inherited other filters from the 1st batch: * These ops don't already have dispatch section in native_functions.yaml; * These ops call one or more DispatchStub (thus "backend specific"); Differential Revision: D23909901 Test Plan: Imported from OSS Reviewed By: ailzhang Pulled By: ljk53 fbshipit-source-id: 3b31e176324b6ac814acee0b0f80d18443bd81a1 --- aten/src/ATen/native/native_functions.yaml | 148 +++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index f5bbb263ed9c..0d5582572d6e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -226,6 +226,8 @@ variants: function, method - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: abs_out # Note [Adding an alias] # To add an alias do the following: @@ -268,6 +270,8 @@ variants: function, method - func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: angle_out - func: view_as_real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full @@ -285,6 +289,8 @@ variants: method - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sgn_out - func: real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full @@ -425,8 +431,12 @@ - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: all - func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: all_out - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -440,8 +450,12 @@ - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: any - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: any_out - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -688,9 +702,13 @@ - func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) variants: function + dispatch: + CPU, CUDA: bernoulli_out - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method @@ -900,6 +918,8 @@ variants: function, method - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_out - func: clamp_max(Tensor self, Scalar max) -> Tensor use_c10_dispatcher: full @@ -910,6 +930,8 @@ variants: function, method - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_max_out - func: clamp_min(Tensor self, Scalar min) -> Tensor use_c10_dispatcher: full @@ -920,6 +942,8 @@ variants: function, method - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_min_out # clip is an alias for clamp - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor @@ -1811,6 +1835,8 @@ - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor variants: function, method + dispatch: + CPU, CUDA: index # NB: This function is special-cased in tools/autograd/gen_variable_type.py # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: # - Tensor Tensor::index(ArrayRef indices) @@ -1843,6 +1869,8 @@ - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) variants: function + dispatch: + CPU, CUDA: _index_put_impl_ - func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor use_c10_dispatcher: full @@ -2142,6 +2170,8 @@ - func: matrix_exp(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: matrix_exp - func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor use_c10_dispatcher: full @@ -2171,6 +2201,8 @@ variants: function, method - func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU, CUDA: max_out - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -2187,6 +2219,8 @@ variants: function, method - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: amax_out # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) @@ -2258,6 +2292,8 @@ variants: function, method - func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) 
indices) + dispatch: + CPU, CUDA: min_out - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -2269,6 +2305,8 @@ variants: function, method - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: amin_out - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor use_c10_dispatcher: full @@ -2584,18 +2622,26 @@ - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_forward - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_backward - func: pdist(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full - func: _pdist_forward(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_forward - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_backward - func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor use_c10_dispatcher: full @@ -2899,10 +2945,14 @@ - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink_backward - func: rsqrt(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3191,27 +3241,39 @@ - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sum_out - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) - func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: nansum - func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: nansum - func: nansum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: nansum_out - func: sum_to_size(Tensor self, int[] size) -> Tensor use_c10_dispatcher: full @@ -3241,23 +3303,33 @@ - func: std(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function - func: std.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: std_out - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -3267,12 +3339,18 @@ - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: prod_out - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method @@ -3428,6 +3506,8 @@ variants: function, method - func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: trunc_out # Alias for trunc - func: fix(Tensor self) -> Tensor @@ -3506,12 +3586,18 @@ - func: var(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: var_out - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -3521,10 +3607,14 @@ - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function @@ -3560,6 +3650,8 @@ - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _s_where - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor use_c10_dispatcher: full @@ -3720,8 +3812,12 @@ variants: function, method - func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) 
out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor variants: function, method @@ -3830,6 +3926,8 @@ - func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: rsub - func: heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -4279,6 +4377,8 @@ - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full @@ -4287,6 +4387,8 @@ - func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_tensor_affine - func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full @@ -4295,6 +4397,8 @@ - func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_channel_affine - func: fake_quantize_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full @@ -4303,6 +4407,8 @@ - func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_channel_affine - func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full @@ -4999,6 +5105,8 @@ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: uniform_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method @@ -5037,10 +5145,14 @@ device_guard: False - func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cross_out - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: cross - func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5711,6 +5823,8 @@ - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: digamma - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: @@ -5782,6 +5896,8 @@ - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: atan2 - func: lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5906,8 +6022,12 @@ - func: maximum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: maximum - func: maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: maximum_out # binary max, alias of maximum # NOTE: max is not an alias for maximum, since there is also unary max @@ -5920,8 +6040,12 @@ - func: minimum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: minimum - func: minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: minimum_out # binary min, alias for minimum # NOTE: min is not an alias for minimum, since there is also unary min @@ -6002,6 +6126,8 @@ - func: all(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: all - func: any(Tensor self) -> Tensor use_c10_dispatcher: full @@ -6077,18 +6203,32 @@ - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: normal_ - func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -6396,10 +6536,14 @@ - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: mse_loss_out - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: mse_loss - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6562,6 +6706,8 @@ - func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: smooth_l1_loss - func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7603,6 +7749,8 @@ - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: sigmoid_backward - func: logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) 
grad_input) -> Tensor(a!) python_module: nn From c6500bcf1494aadf7bd86adb554fdad376b7f105 Mon Sep 17 00:00:00 2001 From: Yanli Zhao Date: Thu, 24 Sep 2020 20:52:17 -0700 Subject: [PATCH 120/449] [reland] Make grad point to bucket buffer in DDP to save memory usage (#44344) Summary: [test all] Pull Request resolved: https://github.com/pytorch/pytorch/pull/44344 reland #41954 Add one argument in DDP API to enable/disable letting grads pointing to views. When it is disabled, behavior is the same as DDP right now; when it is enabled, Make both variable.grad() and grad in distautograd context point to bucket buffer in DDP to save memory usage. In this case, grad will be view of bucket buffer tensors, in order to make it compatiable with optimizer.zero_grad(), we made changes in #41283. Also be noted that we can not make variable.grad() pointing to bucket buffer during construction time, because we want to keep grad undefined for unused parameters. ghstack-source-id: 112845787 Test Plan: 1. When grad_is_view=false: a. roberta_base, peak memory usage 8250MB, p50 per iteration latency 0.923second, https://www.internalfb.com/intern/fblearner/details/218029699/?notif_channel=cli b. resnet, peak memory usage 3089MB, p50 per iteration latency 0.120second, https://www.internalfb.com/intern/fblearner/details/218029035/?notif_channel=cli c. accuracy benchmark, distributed=false, .accuracy 40.914535522461, .loss: 1.6370717287064; distributed=true, .accuracy: 39.966053009033, .loss: 1.6849111318588 https://www.internalfb.com/intern/fblearner/details/218035688/?notif_channel=cli d. classy vision uru production flow, https://www.internalfb.com/intern/fblearner/details/219065811/?notif_channel=cli e. pytext flow, https://www.internalfb.com/intern/fblearner/details/219137458/?notif_channel=cli 2. When grad_is_view=true: a. roberta_base, peak memory usage 7183MB, p50 per iteration latency 0.908second, https://www.internalfb.com/intern/fblearner/details/217882539?tab=operator_details b. resnet, peak memory usage 2988 MB, p50 per iteration latency 0.119second, https://www.internalfb.com/intern/fblearner/details/218028479/?notif_channel=cli c. accuracy benchmark, distributed=false, .accuracy 41.713260650635, .loss: 1.69939661026; distributed=true, .accuracy: 39.966053009033, .loss: 1.6849111318588, https://www.internalfb.com/intern/fblearner/details/218037058/?notif_channel=cli d. classy vision uru production flow, expected, can not work well with apex.amp https://www.internalfb.com/intern/fblearner/details/219205218/?notif_channel=cli e. pytext flow, detach_() related error, expected, as pytext zero_grad depends on apex repo where detach_() is called. also seeing the warning in finalize_bucket_dense due to tied weights, which is expected. 
https://www.internalfb.com/intern/fblearner/details/219150229/?notif_channel=cli Reviewed By: mrshenli Differential Revision: D23588186 fbshipit-source-id: f724d325b954ef6f06ede31759bf01dd29a6f5e5 --- test/distributed/test_c10d.py | 180 +++++++++---- torch/csrc/autograd/VariableTypeManual.cpp | 7 +- .../csrc/autograd/functions/accumulate_grad.h | 5 + torch/csrc/distributed/c10d/init.cpp | 2 + torch/csrc/distributed/c10d/reducer.cpp | 239 +++++++++++++----- torch/csrc/distributed/c10d/reducer.h | 17 +- torch/nn/parallel/distributed.py | 29 ++- .../_internal/distributed/distributed_test.py | 66 ++++- 8 files changed, 425 insertions(+), 120 deletions(-) diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 64e255fce3e6..a81bc53f175a 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -1974,13 +1974,15 @@ def tearDown(self): def world_size(self): return 2 - def _prepare_single_device_module(self, process_group, devices, device_ids, global_batch_size): + def _prepare_single_device_module( + self, process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view=False): model = Net() ddp_model = DistributedDataParallel( copy.deepcopy(model).to(devices[0]), device_ids=device_ids, process_group=process_group, - bucket_cap_mb=0.001) + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view) model.to(devices[0]) @@ -1989,7 +1991,7 @@ def _prepare_single_device_module(self, process_group, devices, device_ids, glob return model, ddp_model, input, target - def _prepare_multi_device_module(self, process_group, devices, device_ids, global_batch_size): + def _prepare_multi_device_module(self, process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view=False): self.assertTrue( len(devices) == 2 or len(devices) == 4, "unexpected devices for ddp tests {}".format(devices)) @@ -2002,14 +2004,15 @@ def _prepare_multi_device_module(self, process_group, devices, device_ids, globa copy.deepcopy(model), device_ids=device_ids, process_group=process_group, - bucket_cap_mb=0.001) + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view) input = torch.randn(global_batch_size, 2).cuda(devices[0]) target = torch.randn(global_batch_size, 4) return model, ddp_model, input, target - def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi_device=False): + def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): """ Note: we pass down `device_ids` all the way to DistributedDataParallel as part of the test. 
Below you find tests that either use a list of @@ -2023,11 +2026,11 @@ def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi if multi_device: model, ddp_model, input, target = \ self._prepare_multi_device_module( - process_group, devices, device_ids, global_batch_size) + process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view) else: model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, devices, device_ids, global_batch_size) + process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view) def step_model(model, input, target): model.train() @@ -2062,17 +2065,21 @@ def update_parameters(model): torch.manual_seed(1337 + iteration) input = input[torch.randperm(global_batch_size)] - def _test_gloo_backend(self, devices, device_ids, multi_device=False): + def _test_gloo_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) - self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @requires_gloo() def test_gloo_backend_cpu_module(self): self._test_gloo_backend([torch.device("cpu")], []) + @requires_gloo() + def test_gloo_backend_cpu_module_grad_is_view(self): + self._test_gloo_backend([torch.device("cpu")], [], gradient_as_bucket_view=True) + @requires_gloo() @skip_if_not_multigpu def test_gloo_backend_1gpu_module_device_ids_integer_list(self): @@ -2101,10 +2108,10 @@ def test_gloo_backend_4gpu_module(self): devices = [torch.device("cuda:" + str(i)) for i in int_devices] self._test_gloo_backend(devices, [], multi_device=True) - def _test_nccl_backend(self, devices, device_ids, multi_device=False): + def _test_nccl_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @requires_nccl() @skip_if_not_multigpu @@ -2169,10 +2176,7 @@ def test_ddp_multi_device_module_config(self): ddp_model = DistributedDataParallel( model, device_ids=gpus, process_group=process_group) - @requires_nccl() - @skip_if_not_multigpu - @skip_if_rocm - def test_fp16(self): + def _test_fp16(self, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -2184,6 +2188,7 @@ def test_fp16(self): device_ids=[gpus[0]], process_group=process_group, bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view ) # Input 2**15, so that the gradients will overflow with a @@ -2204,7 +2209,16 @@ def test_fp16(self): @requires_nccl() @skip_if_not_multigpu @skip_if_rocm - def test_arbitrary_forward_return_value(self): + def test_fp16(self): + self._test_fp16() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_fp16_grad_is_view(self): + self._test_fp16(gradient_as_bucket_view=True) + + def _test_arbitrary_forward_return_value(self, 
gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2240,6 +2254,7 @@ def forward(self, x, fn): ForwardReturnValueModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) batch_size = 4 @@ -2295,7 +2310,16 @@ def test(box, unbox): @requires_nccl() @skip_if_not_multigpu @skip_if_rocm - def test_find_unused_parameters_kwarg(self): + def test_arbitrary_forward_return_value(self): + self._test_arbitrary_forward_return_value() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_arbitrary_forward_return_value_grad_is_view(self): + self._test_arbitrary_forward_return_value(gradient_as_bucket_view=True) + + def _test_find_unused_parameters_kwarg(self, gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2325,12 +2349,13 @@ def forward(self, x): input = torch.rand([batch_size, 2], dtype=torch.float) target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(device_id) - def test_find_unused_parameters(find_unused_parameters, test_default=False): + def test_find_unused_parameters(find_unused_parameters, test_default=False, gradient_as_bucket_view=False): if test_default: model = DistributedDataParallel( FindUnusedParametersModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) else: model = DistributedDataParallel( @@ -2338,6 +2363,7 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): device_ids=[device_id], process_group=process_group, find_unused_parameters=find_unused_parameters, + gradient_as_bucket_view=gradient_as_bucket_view, ) output, fc3 = model(input) @@ -2349,7 +2375,7 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): # trigger an error when `backward` is called (because fc3 is an unused # parameter and will therefore be marked ready twice). try: - test_find_unused_parameters(True) + test_find_unused_parameters(True, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.assertTrue( str(ex).startswith("Expected to mark a variable ready only once.")) @@ -2359,19 +2385,29 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): # Then test that the default behavior can be overridden by setting # `find_unused_parameters=False`. 
try: - test_find_unused_parameters(False) + test_find_unused_parameters(False, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.fail("Unexpected exception: %s" % ex) # Test find_unused_parameters defaults to False try: - test_find_unused_parameters(True, test_default=True) + test_find_unused_parameters(True, test_default=True, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.fail("Unexpected exception: %s" % ex) - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_global_local_unused_params_grad(self): + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_find_unused_parameters_kwarg(self): + self._test_find_unused_parameters_kwarg() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_find_unused_parameters_kwarg_grad_is_view(self): + self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True) + + def _test_global_local_unused_params_grad(self, gradient_as_bucket_view=False): """ By simulating a multi-task training, this test is to make sure: 1) DDP does not touch the grad of globally unused parameters. @@ -2417,6 +2453,7 @@ def run_and_verify_grad(model): GlobalLocalUnusedParamModule().cpu(), process_group=process_group, find_unused_parameters=True, + gradient_as_bucket_view=gradient_as_bucket_view, ) run_and_verify_grad(cpu_model) @@ -2427,9 +2464,20 @@ def run_and_verify_grad(model): device_ids=[device_id], process_group=process_group, find_unused_parameters=True, + gradient_as_bucket_view=gradient_as_bucket_view, ) run_and_verify_grad(gpu_model) + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_global_local_unused_params_grad(self): + self._test_global_local_unused_params_grad() + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_global_local_unused_params_grad_with_grad_is_view(self): + self._test_global_local_unused_params_grad(gradient_as_bucket_view=True) + @requires_gloo() @skip_if_lt_x_gpu(2) def test_find_unused_parameters_when_unused_parameters_empty(self): @@ -2486,10 +2534,7 @@ def run_and_verify_grad(model): ) run_and_verify_grad(gpu_model) - @requires_nccl() - @skip_if_not_multigpu - @skip_if_rocm - def test_multiple_outputs_multiple_backward(self): + def _test_multiple_outputs_multiple_backward(self, gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2523,6 +2568,7 @@ def forward(self, x): MultipleOutputModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) batch_size = 4 @@ -2537,6 +2583,18 @@ def forward(self, x): loss2 = criterion(output2, target) loss2.backward() + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_multiple_outputs_multiple_backward(self): + self._test_multiple_outputs_multiple_backward() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_multiple_outputs_multiple_backward_grad_is_view(self): + self._test_multiple_outputs_multiple_backward(gradient_as_bucket_view=True) + @requires_nccl() @skip_if_not_multigpu @skip_if_rocm @@ -2586,7 +2644,7 @@ def check_no_grads(): # No parameter should have their gradient set. check_no_grads() - def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None): + def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None, gradient_as_bucket_view=False): """ This is the recommended way to implement accumulate grads. 
If ``ddp_comm_hook`` input was specified, it will also register that hook @@ -2601,7 +2659,7 @@ def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None): local_batch_size = len(devices) model, ddp_model, input, target = self._prepare_single_device_module( - process_group, devices, devices, global_batch_size + process_group, devices, devices, global_batch_size, gradient_as_bucket_view ) if ddp_comm_hook is not None: @@ -2658,6 +2716,15 @@ def test_accumulate_gradients_no_sync(self): """ self._test_accumulate_gradients_no_sync() + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_no_sync_grad_is_view(self): + """ + Runs _test_accumulate_gradients_no_sync using default inputs + """ + self._test_accumulate_gradients_no_sync(gradient_as_bucket_view=True) + @requires_nccl() @skip_if_not_multigpu @skip_if_rocm @@ -2708,10 +2775,7 @@ def div(fut): num_iters=4, ddp_comm_hook=allreduce_with_then_hook ) - @requires_nccl() - @skip_if_not_multigpu - @skip_if_rocm - def test_accumulate_gradients_module(self): + def _test_accumulate_gradients_module(self, gradient_as_bucket_view=False): # This is NOT the recommended way to implement accumulating grads, but # we would like to make sure DDP does not mess up with the underlying # module. @@ -2723,7 +2787,7 @@ def test_accumulate_gradients_module(self): model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, devices, devices, global_batch_size) + process_group, devices, devices, global_batch_size, gradient_as_bucket_view) def step_model(model, input, target): model.train() @@ -2763,6 +2827,18 @@ def step_model(model, input, target): torch.manual_seed(1337 + iteration) input = input[torch.randperm(global_batch_size)] + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_module(self): + self._test_accumulate_gradients_module() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_module_with_grad_is_view(self): + self._test_accumulate_gradients_module(gradient_as_bucket_view=True) + @requires_gloo() def test_ignored_output(self): """ @@ -3022,8 +3098,7 @@ def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): ddp_parameter = next(ddp_model.parameters()) self.assertEqual(vanilla_parameter.grad, ddp_parameter.grad) - @requires_gloo() - def test_sparse_gradients(self): + def _test_sparse_gradients(self, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) @@ -3034,10 +3109,19 @@ def test_sparse_gradients(self): ddp_model = DistributedDataParallel( copy.deepcopy(vanilla_model), process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) self._run_and_verify_sparse_gradients(vanilla_model, ddp_model) + @requires_gloo() + def test_sparse_gradients(self): + self._test_sparse_gradients() + + @requires_gloo() + def test_sparse_gradients_grad_is_view(self): + self._test_sparse_gradients(gradient_as_bucket_view=True) + def _test_grad_layout(self, replica_devices, layer_devs, local_batch_size): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -3206,12 +3290,13 @@ def test_ddp_comm_hook_future_passing_cpu(self): # without the comm_hook, result would be 0.25 * torch.ones(2, 2). 
self._run_and_verify_hook(cpu_model, 8, 2 * torch.ones(2, 2)) - def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None): + def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None, gradient_as_bucket_view=False): device_id = gpus_for_rank(self.world_size)[self.rank][0] gpu_model = DistributedDataParallel( ModuleForDdpCommHook().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) # Register DDP Communication Hook if defined @@ -3276,10 +3361,7 @@ def test_ddp_comm_hook_future_passing_gpu_nccl(self): # without the comm_hook, result would be 0.25 * torch.ones(2, 2). self._run_and_verify_hook(gpu_model, 8, 2 * torch.ones(2, 2)) - @requires_nccl() - @skip_if_lt_x_gpu(2) - @skip_if_rocm - def test_ddp_comm_hook_allreduce_hook_nccl(self): + def _test_ddp_comm_hook_allreduce_hook_nccl(self, gradient_as_bucket_view=False): """ This unit test verifies whether a DDP communication hook that just calls allreduce gives the same result result with the case of no hook registered. @@ -3294,11 +3376,23 @@ def allreduce_hook(state: object, bucket: dist._GradBucket) -> torch._C.Future: return process_group.allreduce(tensors).get_future() # Get GPU model with allreduce_hook registered. - gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, allreduce_hook) + gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, allreduce_hook, gradient_as_bucket_view) # check whether the grads are equal to what DDP without hook would return. self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_ddp_comm_hook_allreduce_hook_nccl(self): + self._test_ddp_comm_hook_allreduce_hook_nccl() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_ddp_comm_hook_allreduce_hook_nccl_grad_is_view(self): + self._test_ddp_comm_hook_allreduce_hook_nccl(gradient_as_bucket_view=True) + @requires_nccl() @skip_if_lt_x_gpu(2) @skip_if_rocm diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index c72c67eb5230..18e5e4f54820 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -269,7 +269,12 @@ Tensor & detach_(Tensor & self) { "of detach_(). Alternatively, create this view with an " "`unsafe_` version of the function that produced it."); } else { - AT_ERROR("Can't detach views in-place. Use detach() instead"); + AT_ERROR("If you are using DistributedDataParallel (DDP) for training, " + "and gradient_as_bucket_view is set as True, gradients are " + "views of DDP buckets, and hence detach_() cannot be called " + "on these gradients. To fix this error, please refer to the " + "Optimizer.zero_grad() function in torch/optim/optimizer.py " + "as the solution."); } } // I think the choice here is conservative. In principle, doing diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index e1a02dc19fd8..dafd07f64b84 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -161,6 +161,11 @@ struct TORCH_API AccumulateGrad : public Node { // valid operation which adds `new_grad` to `variable_grad` in // place. `variable_grad` is thus still referring to the same tensor // after the operation. + // Also DistributedDataParallel(DDP) package relies on grad being + // mutated in place for saving peak memory usage. 
DDP will still + // work correctly if it is mutated out of place here, but DDP will + // maintain one extra copy of grad tensors in buffer and thus + // increase peak memory usage. variable_grad += new_grad; CHECK_RESULT(variable_grad, variable); // ^ We could enforce the contract more aggressively here by writing: diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index aff2da31c133..165d6a1c8603 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -159,6 +159,7 @@ PyObject* c10d_init(PyObject* _unused) { std::shared_ptr<::c10d::ProcessGroup>, std::vector>, int64_t, + bool, bool>(), py::arg("replicas"), py::arg("bucket_indices"), @@ -166,6 +167,7 @@ PyObject* c10d_init(PyObject* _unused) { py::arg("expect_sparse_gradients") = std::vector>(), py::arg("bucket_bytes_cap") = ::c10d::kDefaultBucketBytesCap, py::arg("find_unused_parameters") = false, + py::arg("gradient_as_bucket_view") = false, py::call_guard()) .def( "initialize_buckets", diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 1a5766eea84e..86916c7994dd 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -32,7 +32,8 @@ Reducer::Reducer( std::shared_ptr process_group, std::vector> expect_sparse_gradients, int64_t bucket_bytes_cap, - bool find_unused_parameters) + bool find_unused_parameters, + bool gradient_as_bucket_view) : replicas_(std::move(replicas)), process_group_(std::move(process_group)), expect_sparse_gradients_(std::move(expect_sparse_gradients)), @@ -41,6 +42,7 @@ Reducer::Reducer( next_bucket_(0), has_marked_unused_parameters_(false), find_unused_parameters_(find_unused_parameters), + gradient_as_bucket_view_(gradient_as_bucket_view), local_used_maps_reduced_(false), backward_stats_base_(0), has_rebuilt_bucket_(false), @@ -310,6 +312,56 @@ void Reducer::verify_replica0_across_processes() { } } +void Reducer::check_grad_layout( + const at::Tensor& grad, + const at::Tensor& bucket_view) { + // Ensure that the gradient type matches the bucket type. + TORCH_CHECK( + grad.options().type_equal(bucket_view.options()), + "Expected ", + bucket_view.toString(), + ", got ", + grad.toString()); + TORCH_INTERNAL_ASSERT(grad.device() == bucket_view.device()); + TORCH_INTERNAL_ASSERT(grad.numel() == bucket_view.numel()); + // AccumulateGrad doesn't HAVE to obey the grad layout contract. + // The penalty for disobedience is reduced performance, not numerical + // death. Warnings here help diagnose poor DDP performance. + if (grad.strides() != bucket_view.strides()) { + TORCH_WARN_ONCE( + "Grad strides do not match bucket view strides. " + "This may indicate grad was not created according to the " + "gradient layout contract, or that the param's strides " + "changed since DDP was constructed. This is not an error, " + "but may impair performance.\n" + "grad.sizes() = ", + grad.sizes(), + ", strides() = ", + grad.strides(), + "\n", + "bucket_view.sizes() = ", + bucket_view.sizes(), + ", strides() = ", + bucket_view.strides()); + } + if (!gradient_as_bucket_view_) { + TORCH_INTERNAL_ASSERT(!grad.is_alias_of(bucket_view)); + } +} + +void Reducer::copy_grad_to_bucket(at::Tensor& grad, at::Tensor& bucket_view) { + // See Note [DDP Communication Hook] + if (comm_hook_ == nullptr) { + // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp + auto wrapped = c10::scalar_to_tensor(double(1.) 
/ divFactor_); + wrapped.unsafeGetTensorImpl()->set_wrapped_number(true); + // Divides while copying into the bucket view. + at::native::mul_out(bucket_view, grad, wrapped); + } else { + bucket_view.copy_(grad); + } +} + void Reducer::mark_variable_ready_dense(VariableIndex index) { const auto replica_index = index.replica_index; const auto variable_index = index.variable_index; @@ -327,49 +379,27 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) { // of the bucket it would otherwise hold. runGradCallbackForVariable(variable, [&](auto& grad) { if (grad.defined()) { - // Ensure that the gradient type matches the bucket type. - TORCH_CHECK( - grad.options().type_equal(bucket_view.options()), - "Expected ", - bucket_view.toString(), - ", got ", - grad.toString()); - // Assert that the grad tensor and the bucket don't share storage. - // If they did, we could avoid the copy altogether. - // The reason for not doing this is that existing code calls - // `detach_` from `zero_grad`, which is incompatible with views. - TORCH_INTERNAL_ASSERT(!grad.is_alias_of(bucket_view)); - TORCH_INTERNAL_ASSERT(grad.device() == bucket_view.device()); - TORCH_INTERNAL_ASSERT(grad.numel() == bucket_view.numel()); - // AccumulateGrad doesn't HAVE to obey the grad layout contract. - // The penalty for disobedience is reduced performance, not numerical - // death. Warnings here help diagnose poor DDP performance. - if (grad.strides() != bucket_view.strides()) { - TORCH_WARN_ONCE( - "Grad strides do not match bucket view strides. " - "This may indicate grad was not created according to the " - "gradient layout contract, or that the param's strides " - "changed since DDP was constructed. This is not an error, " - "but may impair performance.\n" - "grad.sizes() = ", - grad.sizes(), - ", strides() = ", - grad.strides(), - "\n", - "bucket_view.sizes() = ", - bucket_view.sizes(), - ", strides() = ", - bucket_view.strides()); - } - // See Note [DDP Communication Hook] - if (comm_hook_ == nullptr) { - // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp - auto wrapped = c10::scalar_to_tensor(double(1.) / divFactor_); - wrapped.unsafeGetTensorImpl()->set_wrapped_number(true); - // Divides while copying into the bucket view. - at::native::mul_out(bucket_view, grad, wrapped); + this->check_grad_layout(grad, bucket_view); + // When gradient_as_bucket_view_ is false, or even when + // gradient_as_bucket_view_ is true, in rare cases users may set grad to + // be None after every iteration. In these cases, grad and bucket_view are + // pointing to different storages and thus need to copy grads to + // bucket_view. If gradient_as_bucket_view_ is set as true, let grad point + // to bucket_view. If grad has already been set as views of buckets in + // previous iterations, no copy is needed. + if (!grad.is_alias_of(bucket_view)) { + this->copy_grad_to_bucket(grad, bucket_view); + if (gradient_as_bucket_view_) { + // Let grad point to bucket_view buffer. + grad = bucket_view; + // The grad is modified and need to be written back. 
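A rough Python restatement of the branch above (a toy sketch, not the C++ path, and ignoring the registered-comm-hook case) may help when reading the reducer change:

    import torch

    def mark_ready_dense(grad, bucket_view, gradient_as_bucket_view, div_factor):
        if grad is None:
            bucket_view.zero_()                # unused parameter: clear its slot
            return grad
        if grad.data_ptr() != bucket_view.data_ptr():
            # Grad lives elsewhere: scale by 1/div_factor while copying it in.
            torch.mul(grad, 1.0 / div_factor, out=bucket_view)
            # With gradient_as_bucket_view, grad is repointed to the bucket view.
            return bucket_view if gradient_as_bucket_view else grad
        # Grad already is the bucket view: just scale it in place.
        bucket_view.div_(div_factor)
        return grad
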
+ return true; + } } else { - bucket_view.copy_(grad); + // If grad and bucket view point to the same storage, no need to copy + if (comm_hook_ == nullptr) { + bucket_view.div_(divFactor_); + } } } else { bucket_view.zero_(); @@ -674,6 +704,17 @@ void Reducer::mark_bucket_ready(size_t bucket_index) { void Reducer::initialize_buckets( std::vector> bucket_indices) { + // If initialize_buckets is called inside DDP constructor, then + // it does not matter rpc context ptr is nullptr or not, as grad + // will not be mutated. + // If initialize_buckets is called during training loop, e.g, inside + // rebuild_buckets(), since grad could be mutated and be pointed to + // bucket_view, then it needs to check rpc context ptr is nullptr or not, + // If rpc context ptr is nullptr, mutate variable.grad(); otherwise, + // mutate grad in rpc context. + using torch::distributed::autograd::ThreadLocalDistAutogradContext; + this->rpc_context_.set(ThreadLocalDistAutogradContext::getContextPtr()); + // This shouldn't be called if we're expecting autograd hooks to fire. TORCH_CHECK( !expect_autograd_hooks_, @@ -825,7 +866,7 @@ void Reducer::initialize_bucket_views( Reducer::BucketReplica& replica, at::Tensor& contents) { for (size_t i = 0; i < replica.variables.size(); i++) { - const auto& v = replica.variables[i]; + auto& v = replica.variables[i]; const auto offset = replica.offsets[i]; const auto length = replica.lengths[i]; if (v.is_non_overlapping_and_dense()) { @@ -844,6 +885,29 @@ void Reducer::initialize_bucket_views( // By default `bucket_views_out` and `bucket_views_in` are // essentially the same thing. replica.bucket_views_out = replica.bucket_views_in; + + // If gradient_as_bucket_view_ is set as true, then there are two cases to + // handle: initialize_bucket_views could be called inside initialize_buckets + // when rebuild_buckets, if grad has already been defined/calculated in + // previous iteration, old grad needs to be copied into new bucket_view and + // let grad point to the new bucket_view, initialize_bucket_views could also + // be called inside initialize_buckets during construction. Grads are not + // defined during construction time, in this case, do not let grad point to + // bucket_view, because grads should be kept as being undefined for globally + // unused parameters. + if (gradient_as_bucket_view_) { + auto& bucket_view = replica.bucket_views_in.back(); + runGradCallbackForVariable(v, [&](auto& grad) { + if (grad.defined() && !grad.is_alias_of(bucket_view)) { + bucket_view.copy_(grad); + grad = bucket_view; + // The grad is modefied and needs to be written back. + return true; + } + // The grad is not modified and does not need to be written back. + return false; + }); + } } } @@ -965,6 +1029,31 @@ void Reducer::prepare_for_backward( } } +void Reducer::copy_bucket_to_grad( + torch::autograd::Variable& variable, + Reducer::BucketReplica& replica, + size_t intra_bucket_index, + bool global_unused) { + const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; + runGradCallbackForVariable(variable, [&](auto& grad) { + // If a parameter is globally unused, we keep its grad untouched. + if (!global_unused) { + if (!grad.defined()) { + // Creates grad according to the "Gradient Layout Contract" + // (see torch/csrc/grad/AccumulateGrad.h) + grad = + torch::autograd::utils::clone_obey_contract(bucket_view, variable); + } else { + grad.copy_(bucket_view); + } + // The grad is modified and needs to be written back. + return true; + } + // The grad is not modified. 
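The bucket/view relationship that initialize_bucket_views maintains can be pictured with a small, self-contained sketch; shapes and the narrow/view calls are illustrative only, the real code uses the stored offsets and lengths (and strided views for dense parameters):

    import torch

    params = [torch.randn(3, 4), torch.randn(5)]
    contents = torch.zeros(sum(p.numel() for p in params))   # one flat bucket
    views, offset = [], 0
    for p in params:
        n = p.numel()
        views.append(contents.narrow(0, offset, n).view_as(p))
        offset += n
    # With gradient_as_bucket_view=True and an already-defined grad, the old
    # grad is copied into its view once and the parameter's .grad then points
    # at that view.
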
+ return false; + }); +} + // A bucket with one or more dense tensors needs to be unflattened. void Reducer::finalize_bucket_dense(Bucket& bucket) { for (size_t replica_index = 0; replica_index < bucket.replicas.size(); @@ -1015,24 +1104,52 @@ void Reducer::finalize_bucket_dense(Bucket& bucket) { } } - const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; - runGradCallbackForVariable(variable, [&](auto& grad) { - // If a parameter is globally unused, we keep its grad untouched. - if (!global_unused) { - if (!grad.defined()) { - // Creates grad according to the "Gradient Layout Contract" - // (see torch/csrc/grad/AccumulateGrad.h) - grad = torch::autograd::utils::clone_obey_contract( - bucket_view, variable); - } else { - grad.copy_(bucket_view); - } - // The grad is modified and needs to be written back. - return true; + if (!gradient_as_bucket_view_) { + copy_bucket_to_grad( + variable, replica, intra_bucket_index, global_unused); + } else { + const auto& bucket_view_out = + replica.bucket_views_out[intra_bucket_index]; + auto& bucket_view_in = replica.bucket_views_in[intra_bucket_index]; + // If communication_hook is registered, bucket_view_out stores + // allreduced results in a newly allocated tensor, copy bucket_view_out + // back to bucket_view_in that referring to replica.content tensor and + // grad. + if (!bucket_view_in.is_alias_of(bucket_view_out)) { + bucket_view_in.copy_(bucket_view_out); } - // The grad is not modified. - return false; - }); + runGradCallbackForVariable(variable, [&](auto& grad) { + // If a parameter is globally unused, we keep its grad untouched. + if (!global_unused) { + // If grad is globally used but locally unused, let grad point to + // bucket_view_in + if (!grad.defined()) { + grad = bucket_view_in; + } else { + if (!grad.is_alias_of(bucket_view_in)) { + grad.copy_(bucket_view_in); + TORCH_WARN_ONCE( + "Detected at least one parameter gradient is not the " + "expected DDP bucket view when setting " + "gradient_as_bucket_view=True. This can happen when " + "multiple parameters sharing the same gradient. For " + "example, param0 and param1 share the same gradient " + "grad0. In this case, grad0 would first point to " + "bucket_view_in0 when param0 is ready. Later, when " + "param1 is ready, it will override grad0 to point to " + "bucket_view_in1. However, param0 still expects grad0 " + "to point to bucket_view_in0, and hence hit this " + "warning. If you saw this message, please double-check if " + "the above situation is expected for your application."); + } + } + // The grad is modified and needs to be written back. + return true; + } + // The grad is not modified. + return false; + }); + } } } } diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 3b441c99a3b6..960a32356acf 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -30,7 +30,8 @@ class Reducer { std::shared_ptr process_group, std::vector> expect_sparse_gradients, int64_t bucket_bytes_cap, - bool find_unused_parameters); + bool find_unused_parameters, + bool gradient_as_bucket_view); ~Reducer() noexcept(false); @@ -124,6 +125,7 @@ class Reducer { bool has_marked_unused_parameters_; const bool find_unused_parameters_; + const bool gradient_as_bucket_view_; std::vector unused_parameters_; // Locally used parameter maps indicating if parameters are used locally // during the current iteration or no_sync session if no_sync is on. 
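copy_bucket_to_grad above is the write-back half of the non-view path; in Python terms it behaves roughly like the following sketch, where clone_obey_contract is approximated with a plain clone:

    def copy_bucket_to_grad(param, bucket_view, globally_unused):
        if globally_unused:
            return                        # leave a globally unused grad untouched
        if param.grad is None:
            # The real code uses clone_obey_contract() to honor the gradient
            # layout contract; a clone is the closest Python analogue.
            param.grad = bucket_view.clone()
        else:
            param.grad.copy_(bucket_view)
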
One @@ -230,6 +232,19 @@ class Reducer { // with the result of `future_work`. void populate_bucket_views_out(BucketReplica& replica, at::Tensor& tensor); + // If gradient_as_bucket_view_ is false, after allreduce buckets, + // copy bucket results back to grads. + void copy_bucket_to_grad( + torch::autograd::Variable& variable, + Reducer::BucketReplica& replica, + size_t intra_bucket_index, + bool global_unused); + // Check layout of grad and bucket_view before calling copy_grad_to_bucket + void check_grad_layout(const at::Tensor& grad, const at::Tensor& bucket_view); + // If gradient_as_bucket_view_ is false, before allreduce buckets, + // copy grads to buckets. + void copy_grad_to_bucket(at::Tensor& grad, at::Tensor& bucket_view); + // A bucket holds N bucket replicas (1 per model replica). // // If every bucket in this struct is ready, the reduction can be kicked off. diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 790a9d1c2fc4..5ec2b0148a21 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -316,6 +316,28 @@ class DistributedDataParallel(Module): are getting different gradients, which should not happen if DistributedDataParallel is correctly used. (default: ``False``) + gradient_as_bucket_view (bool): this is a prototype feature. When set to ``True``, + gradients will be views pointing to different offsets of + allreduce communication buckets. This can reduce peak memory + usage, where the saved memory size will be equal to the total + gradients size. Moreover, it avoids the overhead of copying + between gradients and allreduce communication buckets. + When gradients are views, ``detach_()`` cannot be called on the + gradients. If hitting such errors, please fix it by referring to + the :meth:`~torch.optim.Optimizer.zero_grad` function in + ``torch/optim/optimizer.py`` as the solution. + Warning! It is also found that ``gradient_as_bucket_view = true`` + does not work as expected when ``apex.amp`` is used for + mixed precision training. ``apex.amp`` maintained stashed gradients + that are used for unscaling gradients. These stashed gradients + are pointed to gradients (will be communication buckets when + ``gradient_as_bucket_view = true``) before starting new iteration. + In new iteration, the communication buckets are mutated and thus + these stashed gradients will be unexpectedly mutated as well, + the unexpectedly muated stashed gradients may result in wrong + results. To fix it, these stashed gradients should not be pointed + to gradients, instead they should be copied from gradients when + ``gradient_as_bucket_view = true``. 
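In user code the new flag and the detach_() restriction described above look roughly like this sketch, assuming the default process group is already initialized and enough GPUs are visible:

    import torch
    import torch.distributed as dist
    import torch.nn as nn

    rank = dist.get_rank()
    ddp = nn.parallel.DistributedDataParallel(
        nn.Linear(10, 10).cuda(rank),
        device_ids=[rank],
        gradient_as_bucket_view=True,     # prototype flag documented above
    )
    ddp(torch.randn(4, 10).cuda(rank)).sum().backward()
    # Grads are now views into communication buckets, so zero them in place
    # rather than detaching or replacing them:
    for p in ddp.parameters():
        if p.grad is not None:
            p.grad.requires_grad_(False)
            p.grad.zero_()
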
Attributes: module (Module): the module to be parallelized @@ -330,7 +352,8 @@ def __init__(self, module, device_ids=None, process_group=None, bucket_cap_mb=25, find_unused_parameters=False, - check_reduction=False): + check_reduction=False, + gradient_as_bucket_view=False): super(DistributedDataParallel, self).__init__() @@ -381,6 +404,7 @@ def __init__(self, module, device_ids=None, self.require_backward_grad_sync = True self.require_forward_param_sync = True self.ddp_join_enabled = False + self.gradient_as_bucket_view = gradient_as_bucket_view if check_reduction: # This argument is no longer used since the reducer @@ -516,7 +540,8 @@ def produces_sparse_gradient(module): self.process_group, expect_sparse_gradient, self.bucket_bytes_cap, - self.find_unused_parameters) + self.find_unused_parameters, + self.gradient_as_bucket_view) # passing a handle to torch.nn.SyncBatchNorm layer self._passing_sync_batchnorm_handle(self._module_copies) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 85b1d65a06ec..f6f2b9a6fbfb 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -2096,6 +2096,14 @@ def _model_step(self, model): param += param.grad param.grad = None + def _model_step_with_zero_grad(self, model): + for param in model.parameters(): + if param.grad is not None: + with torch.no_grad(): + param += param.grad + param.grad.requires_grad_(False) + param.grad.zero_() + def _prepare_dummy_data(self, local_bs): # global_bs for DDP should be divisible by WORLD_SIZE world_size = int(os.environ["WORLD_SIZE"]) @@ -2118,7 +2126,8 @@ def _assert_equal_param(self, param_gpu, param_DDP): self.assertEqual(p_gpu, p_DDP) def _test_DDP_5iter( - self, model_base, model_DDP, input, target, loss, local_bs, rank, batch_size, test_save, offset=None, world_size=0 + self, model_base, model_DDP, input, target, loss, local_bs, rank, batch_size, test_save, + offset=None, world_size=0, zero_grad=False ): for idx in range(5): # single cpu/gpu training @@ -2137,8 +2146,12 @@ def _test_DDP_5iter( ) # Update weights and run a second iteration to shake out errors - self._model_step(model_base) - self._model_step(model_DDP) + if zero_grad: + self._model_step_with_zero_grad(model_base) + self._model_step_with_zero_grad(model_DDP) + else: + self._model_step(model_base) + self._model_step(model_DDP) self._assert_equal_param( list(model_base.parameters()), list(model_DDP.module.parameters()) ) @@ -2159,7 +2172,7 @@ def _test_DDP_5iter( for k in model_DDP.state_dict(): self.assertEqual(model_DDP.state_dict()[k], saved_model.state_dict()[k]) - def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None): + def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None, gradient_as_bucket_view=False): # Run a simple end to end DDP model, use result of single node model # as baseline @@ -2174,7 +2187,7 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None): model_DDP = copy.deepcopy(model) model_DDP.cuda(gpu_subset[0]) model_DDP = nn.parallel.DistributedDataParallel( - model_DDP, device_ids=gpu_subset + model_DDP, device_ids=gpu_subset, gradient_as_bucket_view=gradient_as_bucket_view ) # test serializable/unserializable @@ -2196,14 +2209,11 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None): local_bs, rank, global_bs, - True + True, ) self._barrier() - @unittest.skipIf( - BACKEND == 
"nccl", "nccl does not support DDP on CPU models" - ) - def test_DistributedDataParallelCPU(self): + def _test_DistributedDataParallelCPU(self, gradient_as_bucket_view=False): # Run a simple end to end DDP-CPU model, use result of single node # model as baseline group, group_id, rank = self._init_global_test() @@ -2213,7 +2223,8 @@ def test_DistributedDataParallelCPU(self): # DDP-CPU training setup model_DDP = copy.deepcopy(model_base) - model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) + model_DDP = nn.parallel.DistributedDataParallel( + model_DDP, gradient_as_bucket_view=gradient_as_bucket_view) # dummy data initialization local_bs = 2 @@ -2221,10 +2232,22 @@ def test_DistributedDataParallelCPU(self): # check two model parameters over 5 iterations self._test_DDP_5iter( - model_base, model_DDP, input_cpu, target, loss, local_bs, rank, global_bs, False + model_base, model_DDP, input_cpu, target, loss, local_bs, rank, global_bs, False, zero_grad=True ) self._barrier() + @unittest.skipIf( + BACKEND == "nccl", "nccl does not support DDP on CPU models" + ) + def test_DistributedDataParallelCPU(self): + self._test_DistributedDataParallelCPU() + + @unittest.skipIf( + BACKEND == "nccl", "nccl does not support DDP on CPU models" + ) + def test_DistributedDataParallelCPU_grad_is_view(self): + self._test_DistributedDataParallelCPU(gradient_as_bucket_view=True) + @unittest.skipIf(BACKEND != 'nccl' and BACKEND != 'gloo', "Only Nccl & Gloo backend support DistributedDataParallel") def test_DistributedDataParallel_requires_grad(self): @@ -2288,6 +2311,25 @@ def test_DistributedDataParallel(self): gpus = list(map(lambda i: torch.device('cuda:' + str(i)), gpus)) self._test_DistributedDataParallel(gpu_subset=gpus, rank=rank, output_device=torch.device('cuda')) + @unittest.skipIf(BACKEND != 'nccl' and BACKEND != 'gloo', + "Only Nccl & Gloo backend support DistributedDataParallel") + @skip_if_no_gpu + @skip_if_rocm + def test_DistributedDataParallel_with_grad_is_view(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + gpus = list(rank_to_GPU[rank]) + self._test_DistributedDataParallel(gpu_subset=gpus, rank=rank, gradient_as_bucket_view=True) + + # test output_device + self._test_DistributedDataParallel( + gpu_subset=gpus, rank=rank, output_device=torch.device('cuda'), gradient_as_bucket_view=True) + + # test device_ids + gpus = list(map(lambda i: torch.device('cuda:' + str(i)), gpus)) + self._test_DistributedDataParallel( + gpu_subset=gpus, rank=rank, output_device=torch.device('cuda'), gradient_as_bucket_view=True) + def _test_DistributedDataParallel_SyncBatchNorm(self, gpu_subset, rank, local_bs, global_bs, offset, output_device=None): # Run a simple end to end DDP model, use result of single node model # as baseline From 0122299f9ba729aa0c9bd43764af53225e03672c Mon Sep 17 00:00:00 2001 From: gunandrose4u <52735340+gunandrose4u@users.noreply.github.com> Date: Thu, 24 Sep 2020 21:12:16 -0700 Subject: [PATCH 121/449] Enable distributed package on windows, Gloo backend supported only (#42897) Summary: Fixes https://github.com/pytorch/pytorch/issues/42095 For test case part will be committed to this PR later mrshenli, please help to review Pull Request resolved: https://github.com/pytorch/pytorch/pull/42897 Reviewed By: osalpekar Differential Revision: D23841786 Pulled By: mrshenli fbshipit-source-id: 334ba1ed73eff2f668857390fc32d1bc7f08e5f3 --- .../install_miniconda3.bat | 7 +++ CMakeLists.txt | 8 ++- caffe2/CMakeLists.txt | 49 +++++++++------ 
cmake/Dependencies.cmake | 5 +- test/cpp/dist_autograd/CMakeLists.txt | 2 +- test/distributed/test_c10d.py | 49 ++++++++++----- test/distributed/test_c10d_spawn.py | 8 ++- test/run_test.py | 11 ++-- tools/build_variables.bzl | 7 ++- torch/CMakeLists.txt | 33 +++++----- torch/csrc/Module.cpp | 4 +- torch/csrc/WindowsTorchApiMacro.h | 6 ++ torch/csrc/distributed/c10d/comm.h | 4 +- torch/csrc/distributed/c10d/init.cpp | 10 ++- torch/csrc/distributed/c10d/reducer.cpp | 22 +++---- torch/csrc/distributed/c10d/reducer.h | 14 +++++ torch/csrc/jit/python/pybind_utils.h | 8 +-- .../csrc/jit/python/python_sugared_value.cpp | 2 +- torch/csrc/jit/runtime/interpreter.cpp | 8 +-- torch/csrc/jit/serialization/pickler.cpp | 6 +- torch/csrc/jit/serialization/unpickler.cpp | 6 +- torch/csrc/utils/future.h | 2 +- torch/distributed/rendezvous.py | 14 ++++- torch/lib/c10d/CMakeLists.txt | 32 ++++++---- torch/lib/c10d/FileStore.cpp | 51 +++++++++++++++- torch/lib/c10d/GlooDeviceFactory.cpp | 33 ++++++---- torch/lib/c10d/ProcessGroupGloo.cpp | 61 ++++++++++++++++--- torch/lib/c10d/Utils.cpp | 3 +- torch/lib/c10d/Utils.hpp | 4 ++ torch/lib/c10d/test/CMakeLists.txt | 15 +++-- torch/lib/c10d/test/CUDATest.hpp | 10 ++- torch/lib/c10d/test/FileStoreTest.cpp | 8 +++ torch/lib/c10d/test/ProcessGroupGlooTest.cpp | 9 ++- torch/lib/c10d/test/TestUtils.hpp | 30 ++++++++- torch/testing/_internal/common_distributed.py | 17 +++++- torch/testing/_internal/common_utils.py | 4 ++ torch/testing/_internal/dist_utils.py | 3 +- .../ddp_under_dist_autograd_test.py | 16 ++--- .../_internal/distributed/distributed_test.py | 48 +++++++++++---- 39 files changed, 462 insertions(+), 167 deletions(-) diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat index a66ef4b651c5..cf7255ce3789 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat @@ -12,4 +12,11 @@ call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Minic if "%REBUILD%"=="" ( call conda install -y -q python=%PYTHON_VERSION% numpy cffi pyyaml boto3 call conda install -y -q -c conda-forge cmake + call conda install -y -q -c rdonnelly libuv ) + +:: Get installed libuv path +@echo off +set libuv_ROOT=%CONDA_PARENT_DIR%\Miniconda3\Library +@echo on +echo libuv_ROOT=%libuv_ROOT% diff --git a/CMakeLists.txt b/CMakeLists.txt index 826c187b602e..3d937e0e1655 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,7 +103,7 @@ endif() # For non-supported platforms, turn USE_DISTRIBUTED off by default. # It is not tested and likely won't work without additional changes. -if(NOT LINUX) +if(NOT LINUX AND NOT WIN32) set(USE_DISTRIBUTED OFF CACHE STRING "Use distributed") # On macOS, if USE_DISTRIBUTED is enabled (specified by the user), # then make Gloo build with the libuv transport. @@ -226,6 +226,12 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." ON) option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF) +# Since TensorPipe does not support Windows, set it to OFF when WIN32 detected +if(WIN32) + set(USE_TENSORPIPE OFF) + message(WARNING "TensorPipe cannot be used on Windows. Set it to OFF") +endif() + # Linux distributions do not want too many embedded sources, in that sense we # need to be able to build pytorch with an (almost) empty third_party # directory. 
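With these build changes a Windows build exposes the c10d layer but not TensorPipe or RPC. A quick probe of what is expected to be available; the helper names below are the torch.distributed query functions and are listed here as an assumption, not something this patch adds:

    import torch.distributed as dist

    print(dist.is_available())        # True once USE_DISTRIBUTED builds on Windows
    print(dist.is_gloo_available())   # Gloo is the only supported backend there
    print(dist.is_nccl_available())   # expected False on Windows
    print(dist.is_mpi_available())    # expected False on Windows
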
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 65f072b6f29d..219b28c69695 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -291,26 +291,29 @@ endif() if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) if(USE_DISTRIBUTED) - add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") - target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) - add_dependencies(process_group_agent torch c10d) # Define this target even if we're building without TensorPipe, to make life # easier to other targets that depend on this. However, in that case, by not # setting the USE_TENSORPIPE compile definition, this target will just end # up being empty. Downstream targets should also add a #ifdef guard. - add_library(tensorpipe_agent - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" - ) - target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) - add_dependencies(tensorpipe_agent torch c10d) - if(USE_TENSORPIPE) - target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) - target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) - add_dependencies(tensorpipe_agent tensorpipe) + if(NOT WIN32) + add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") + target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) + add_dependencies(process_group_agent torch c10d) + + add_library(tensorpipe_agent + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" + ) + target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) + add_dependencies(tensorpipe_agent torch c10d) + if(USE_TENSORPIPE) + target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) + target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) + add_dependencies(tensorpipe_agent tensorpipe) + endif() endif() endif() @@ -493,7 +496,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" ) endif() - if(USE_DISTRIBUTED) + if(USE_DISTRIBUTED AND NOT WIN32) append_filelist("libtorch_distributed_sources" TORCH_SRCS) endif() endif() @@ -837,7 +840,7 @@ endif() if(BUILD_TEST AND NOT USE_ROCM) add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) add_subdirectory(${TORCH_ROOT}/test/cpp/tensorexpr ${CMAKE_BINARY_DIR}/test_tensorexpr) - if(USE_DISTRIBUTED) + if(USE_DISTRIBUTED AND NOT WIN32) add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc) endif() endif() @@ -889,9 +892,7 @@ endif() DESTINATION share/cmake/Torch) if(USE_DISTRIBUTED) - if(NOT MSVC) - add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) - endif() + add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) endif() @@ -966,6 +967,14 @@ if(USE_DISTRIBUTED) target_compile_definitions(torch_cpu PRIVATE USE_DISTRIBUTED ) + # Pass USE_RPC in order to reduce use of + # #if defined(USE_DISTRIBUTED) && !defined(_WIN32) + # need to be removed when RPC is 
supported + if(NOT WIN32) + target_compile_definitions(torch_cpu PRIVATE + USE_RPC + ) + endif() # Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp # can only be compiled with USE_TENSORPIPE is set. if(USE_TENSORPIPE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 028098f61d36..023bbe9e8d07 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1253,10 +1253,7 @@ if(USE_CUDA) endif() if(USE_GLOO) - if(MSVC) - message(WARNING "Gloo can not be used on Windows.") - caffe2_update_option(USE_GLOO OFF) - elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) message(WARNING "Gloo can only be used on 64-bit systems.") caffe2_update_option(USE_GLOO OFF) else() diff --git a/test/cpp/dist_autograd/CMakeLists.txt b/test/cpp/dist_autograd/CMakeLists.txt index 5d23602881f0..9969c63e16d5 100644 --- a/test/cpp/dist_autograd/CMakeLists.txt +++ b/test/cpp/dist_autograd/CMakeLists.txt @@ -1,4 +1,4 @@ -if(USE_DISTRIBUTED) +if(USE_DISTRIBUTED AND NOT WIN32) set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd") set(DIST_AUTOGRAD_TEST_SOURCES ${TORCH_ROOT}/test/cpp/common/main.cpp diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index a81bc53f175a..911a73ce432e 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -29,7 +29,7 @@ from torch.testing._internal.common_distributed import MultiProcessTestCase, \ requires_gloo, requires_nccl, requires_nccl_version, \ skip_if_not_multigpu, skip_if_lt_x_gpu, get_timeout, skip_if_rocm, \ - simple_sparse_reduce_tests + simple_sparse_reduce_tests, skip_if_win32, create_device from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, \ retry_on_connect_failures, ADDRESS_IN_USE, CONNECT_TIMEOUT, TEST_WITH_TSAN @@ -255,6 +255,7 @@ def create_tcp_store(addr): raise RuntimeError("Unable to find free port (tried %s)" % ", ".join(ports)) +@skip_if_win32() class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): store = create_tcp_store('localhost') @@ -273,6 +274,7 @@ def test_address_already_in_use(self): store2 = c10d.TCPStore(addr, port, 1, True) # noqa: F841 +@skip_if_win32() class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): super(PrefixTCPStoreTest, self).setUp() @@ -329,6 +331,7 @@ def test_unknown_handler(self): c10d.rendezvous('invalid://') +@skip_if_win32() class RendezvousEnvTest(TestCase): @retry_on_connect_failures def test_common_errors(self): @@ -455,7 +458,7 @@ def test_common_errors(self): def test_nominal(self): with tempfile.NamedTemporaryFile(delete=False) as file: - url = 'file://%s?world_size=%d' % (file.name, 2) + url = f'file:///{file.name.replace(os.path.sep, "/")}?world_size=2' gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -474,6 +477,7 @@ def test_nominal(self): self.assertEqual(b"value1", store0.get("key1")) +@skip_if_win32() class RendezvousTCPTest(TestCase): def create_tcp_url(self): @@ -544,9 +548,13 @@ def _test_store_timeout(self, backend, init_method, c2p): def _init_methods(self): f = tempfile.NamedTemporaryFile(delete=False) - yield "file://%s" % f.name - f.close() - yield "tcp://127.0.0.1:%d" % common.find_free_port() + if sys.platform == 'win32': + yield "file:///%s" % f.name.replace("\\", "/") + f.close() + else: + yield "file://%s" % f.name + f.close() + yield "tcp://127.0.0.1:%d" % common.find_free_port() def _test_default_store_timeout(self, backend): for init_method in 
self._init_methods(): @@ -584,11 +592,16 @@ def test_default_store_timeout_gloo(self): class ProcessGroupGlooTest(MultiProcessTestCase): def setUp(self): super(ProcessGroupGlooTest, self).setUp() - self._fork_processes() + + # For Windows platform, Python does not support fork, change it to spawn here. + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def opts(self, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + opts.devices = [create_device(interface=LOOPBACK)] opts.timeout = 5.0 opts.threads = threads return opts @@ -598,8 +611,8 @@ def test_multi_device_constructor(self): opts = c10d.ProcessGroupGloo.Options() opts.timeout = 5.0 opts.devices = [ - c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), - c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), + create_device(interface=LOOPBACK), + create_device(interface=LOOPBACK), ] pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts) @@ -1514,6 +1527,7 @@ def test_barrier_implies_wait(self): for i, tensor in enumerate(tensors): self.assertEqual(torch.full(size, float(i * self.world_size)), tensor) + @skip_if_win32() def test_round_robin(self): num_process_groups = 2 store = c10d.FileStore(self.file_name, self.world_size) @@ -1531,6 +1545,7 @@ def test_round_robin(self): pg.broadcast(tensor, root=0).wait() self.assertEqual(torch.full([100, 100], 0.), tensor) + @skip_if_win32() def test_round_robin_create_destroy(self): store = c10d.FileStore(self.file_name, self.world_size) @@ -1959,7 +1974,10 @@ def forward(self, x): class DistributedDataParallelTest(MultiProcessTestCase): def setUp(self): super(DistributedDataParallelTest, self).setUp() - self._fork_processes() + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def tearDown(self): # DistributedDataParallel test doesn't seem to call FileStore destructor @@ -2068,7 +2086,7 @@ def update_parameters(model): def _test_gloo_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @@ -3947,7 +3965,10 @@ def test_nccl_timeout(self): class CommTest(MultiProcessTestCase): def setUp(self): super(CommTest, self).setUp() - self._fork_processes() + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def tearDown(self): super(CommTest, self).tearDown() @@ -4013,7 +4034,7 @@ def test_broadcast_coalesced_nccl(self): def test_broadcast_coalesced_gloo_cuda(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cuda:%d" % self.rank) ranks = list(range(self.world_size)) @@ -4024,7 +4045,7 @@ def test_broadcast_coalesced_gloo_cuda(self): def test_broadcast_coalesced_gloo_cpu(self): store = c10d.FileStore(self.file_name, self.world_size) options = 
c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cpu") ranks = list(range(self.world_size)) diff --git a/test/distributed/test_c10d_spawn.py b/test/distributed/test_c10d_spawn.py index d0bf00b8a08a..c84608e8f178 100644 --- a/test/distributed/test_c10d_spawn.py +++ b/test/distributed/test_c10d_spawn.py @@ -10,8 +10,10 @@ import torch.nn as nn from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU -from torch.testing._internal.common_distributed import requires_gloo -from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, skipIfRocm +from torch.testing._internal.common_distributed import requires_gloo, \ + create_device +from torch.testing._internal.common_utils import TestCase, load_tests, \ + run_tests, skipIfRocm from torch.testing._internal.common_utils import NO_MULTIPROCESSING_SPAWN, TEST_WITH_TSAN @@ -39,7 +41,7 @@ class ProcessGroupShareTensorTest(TestCase): @classmethod def opts(cls, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [c10d.ProcessGroupGloo.create_device(interface="lo")] + opts.devices = [create_device(interface='lo')] opts.timeout = 5.0 opts.threads = threads return opts diff --git a/test/run_test.py b/test/run_test.py index d63fc372f9c2..0f9d14a78605 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -13,7 +13,7 @@ import torch import torch._six from torch.utils import cpp_extension -from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell +from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell, FILE_SCHEMA import torch.distributed as dist from typing import Dict, Optional @@ -99,7 +99,6 @@ 'distributed/rpc/test_process_group_agent', 'distributed/rpc/test_tensorpipe_agent', 'distributed/test_distributed_fork', - 'distributed/test_distributed_spawn', ] ROCM_BLOCKLIST = [ @@ -306,9 +305,13 @@ def test_distributed(test_module, test_directory, options): 'MPI not available -- MPI backend tests will be skipped') config = DISTRIBUTED_TESTS_CONFIG for backend, env_vars in config.items(): + if sys.platform == 'win32' and backend != 'gloo': + continue if backend == 'mpi' and not mpi_available: continue for with_init_file in {True, False}: + if sys.platform == 'win32' and not with_init_file: + continue tmp_dir = tempfile.mkdtemp() if options.verbose: init_str = "with {} init_method" @@ -322,9 +325,9 @@ def test_distributed(test_module, test_directory, options): os.environ.update(env_vars) if with_init_file: if test_module in ["test_distributed_fork", "test_distributed_spawn"]: - init_method = 'file://{}/'.format(tmp_dir) + init_method = f'{FILE_SCHEMA}{tmp_dir}/' else: - init_method = 'file://{}/shared_init_file'.format(tmp_dir) + init_method = f'{FILE_SCHEMA}{tmp_dir}/shared_init_file' os.environ['INIT_METHOD'] = init_method try: os.mkdir(os.path.join(tmp_dir, 'barrier')) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 174bb858da44..c21fab8ec2cf 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -537,11 +537,14 @@ libtorch_python_core_sources = [ "torch/csrc/utils/disable_torch_function.cpp", ] -libtorch_python_distributed_sources = [ - "torch/csrc/distributed/autograd/init.cpp", +libtorch_python_distributed_core_sources = [ "torch/csrc/distributed/c10d/comm.cpp", "torch/csrc/distributed/c10d/init.cpp", 
"torch/csrc/distributed/c10d/reducer.cpp", +] + +libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ + "torch/csrc/distributed/autograd/init.cpp", "torch/csrc/distributed/rpc/init.cpp", "torch/csrc/distributed/rpc/process_group_agent.cpp", "torch/csrc/distributed/rpc/py_rref.cpp", diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index b78dc4a362a7..2ae2f7f737fe 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -160,25 +160,28 @@ endif() if(USE_DISTRIBUTED) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED) - if(NOT MSVC) + if(WIN32) + append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) + else() + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RPC) append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS) - # Disable certain warnings for GCC-9.X - if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - endif() - list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) - if(USE_TENSORPIPE) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) - list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) - endif() endif() + # Disable certain warnings for GCC-9.X + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + endif() + if(USE_TENSORPIPE) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) + list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) + endif() + list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) endif() -if(USE_NCCL) +if(USE_NCCL AND NOT WIN32) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index ed4aa21a8f76..ae6f15155f2a 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -688,9 +688,9 @@ PyObject* initModule() { #ifdef USE_CUDA THPUtils_addPyMethodDefs(methods, THCPModule_methods()); #endif -#ifdef USE_DISTRIBUTED -#ifdef USE_C10D +#if defined(USE_DISTRIBUTED) && defined(USE_C10D) THPUtils_addPyMethodDefs(methods, torch::distributed::c10d::python_functions()); +#ifndef _WIN32 THPUtils_addPyMethodDefs(methods, torch::distributed::rpc::python_functions()); THPUtils_addPyMethodDefs( methods, torch::distributed::autograd::python_functions()); diff --git a/torch/csrc/WindowsTorchApiMacro.h 
b/torch/csrc/WindowsTorchApiMacro.h index 7f8ef4e01677..7f44db0baba9 100644 --- a/torch/csrc/WindowsTorchApiMacro.h +++ b/torch/csrc/WindowsTorchApiMacro.h @@ -5,3 +5,9 @@ // There's no difference between aten, torch and caffe2 libs any more // TODO: clean up the naming for consistency #define TORCH_API CAFFE2_API + +#ifdef _WIN32 +#define TORCH_PYTHON_API +#else +#define TORCH_PYTHON_API CAFFE2_API +#endif diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index e2b501f08aff..2eb626c40232 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -38,7 +38,7 @@ class GradBucket { // DDP's c10d reducer allows communication hooks defined as a sub class // of CommHookInterface. CommHookInterface is an abstract class and can // be used to implement both Python and CPP hooks. -struct TORCH_API CommHookInterface { +struct TORCH_PYTHON_API CommHookInterface { public: virtual ~CommHookInterface() {} @@ -59,7 +59,7 @@ struct TORCH_API CommHookInterface { // PythonCommHook enables registering a python hook to c10d reducer and is a // sub class of CommHookInterface. -class TORCH_API PythonCommHook : public CommHookInterface { +class TORCH_PYTHON_API PythonCommHook : public CommHookInterface { public: // The constructor takes a state and a callable hook. Inputs are Python // objects. The state is passed to the hook in runHook function can be used to diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 165d6a1c8603..be1752d7366f 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1,7 +1,11 @@ #include #include +#ifndef _WIN32 #include +#include +#include +#endif #include #ifdef USE_C10D_GLOO @@ -17,8 +21,6 @@ #endif #include -#include -#include #include #include @@ -323,6 +325,7 @@ They are used in specifying strategies for reduction collectives, e.g., shared_ptr_class_<::c10d::FileStore>(module, "FileStore", store) .def(py::init()); +#ifndef _WIN32 shared_ptr_class_<::c10d::HashStore>(module, "HashStore", store) .def(py::init<>()); @@ -340,6 +343,7 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("is_master"), py::arg("timeout") = std::chrono::milliseconds(::c10d::Store::kDefaultTimeout)); +#endif shared_ptr_class_<::c10d::PrefixStore>(module, "PrefixStore", store) .def(py::init>()); @@ -607,6 +611,7 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("opts") = ::c10d::BarrierOptions(), py::call_guard()); +#ifndef _WIN32 module.def( "_round_robin_process_groups", [](std::vector> processGroups) @@ -620,6 +625,7 @@ They are used in specifying strategies for reduction collectives, e.g., }, py::arg("process_groups"), py::call_guard()); +#endif #ifdef USE_C10D_GLOO auto processGroupGloo = shared_ptr_class_<::c10d::ProcessGroupGloo>( diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 86916c7994dd..814d3494ff4e 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -89,10 +89,7 @@ Reducer::Reducer( for (size_t variable_index = 0; variable_index < variable_count; variable_index++) { auto& variable = replicas_[replica_index][variable_index]; - const auto index = VariableIndex{ - .replica_index = replica_index, - .variable_index = variable_index, - }; + const auto index = VariableIndex(replica_index, variable_index); // The gradient accumulator function is lazily initialized once. 
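Since the HashStore and TCPStore bindings are compiled out on Windows above, FileStore is the store that remains available everywhere. A sketch of wiring it straight into the process group; the path and world size are placeholders, and passing a prebuilt store to init_process_group is assumed to be supported by the Python API of this era:

    import torch.distributed as dist

    store = dist.FileStore("/tmp/ddp_store", 2)   # on Windows e.g. r"C:\tmp\ddp_store"
    dist.init_process_group("gloo", store=store, rank=0, world_size=2)
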
// Therefore we can use its presence in the autograd graph as @@ -100,15 +97,19 @@ Reducer::Reducer( auto grad_accumulator = torch::autograd::impl::grad_accumulator(variable); +#ifndef _WIN32 using torch::distributed::autograd::ThreadLocalDistAutogradContext; +#endif // Hook to execute after the gradient accumulator has executed. hooks_.emplace_back( grad_accumulator->add_post_hook( torch::make_unique( [=](const torch::autograd::variable_list& outputs, const torch::autograd::variable_list& /* unused */) { +#ifndef _WIN32 this->rpc_context_.set( ThreadLocalDistAutogradContext::getContextPtr()); +#endif this->autograd_hook(index); return outputs; })), @@ -477,10 +478,7 @@ void Reducer::push_rebuilt_params_for_all_indices() { const auto variable_count = replicas_[replica_index].size(); for (size_t variable_index = 0; variable_index < variable_count; ++variable_index) { - const auto index = VariableIndex{ - .replica_index = replica_index, - .variable_index = variable_index, - }; + const auto index = VariableIndex(replica_index, variable_index); push_rebuilt_params(index); } } @@ -850,10 +848,8 @@ void Reducer::initialize_buckets( TORCH_CHECK( variable_index < variable_locators_.size(), "Out of range variable index specified."); - variable_locators_[variable_index] = VariableLocator{ - .bucket_index = bucket_index, - .intra_bucket_index = intra_bucket_index++, - }; + variable_locators_[variable_index] = VariableLocator( + bucket_index, intra_bucket_index++); } bucket.variable_indices = std::move(bucket_indices[bucket_index]); @@ -1235,7 +1231,9 @@ void Reducer::runGradCallbackForVariable( cb(variable.mutable_grad()); } else { // Under distributed autograd +#ifndef _WIN32 context_ptr->runGradCallbackForVariable(variable, std::move(cb)); +#endif } } diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 960a32356acf..486b7337366a 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -104,6 +104,13 @@ class Reducer { struct VariableIndex { size_t replica_index; size_t variable_index; + + VariableIndex() = default; + + VariableIndex(size_t replica_index_, size_t variable_index_) { + replica_index = replica_index_; + variable_index = variable_index_; + } }; void push_rebuilt_params(const VariableIndex& index); @@ -281,6 +288,13 @@ class Reducer { size_t bucket_index; // Index of parameter in single bucket replica. size_t intra_bucket_index; + + VariableLocator() = default; + + VariableLocator(size_t bucket_index_, size_t intra_bucket_index_) { + bucket_index = bucket_index_; + intra_bucket_index = intra_bucket_index_; + } }; // Map the index of a variable to its location in the bucket structure. 
diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index 65f5a49145c8..4be55a9caa90 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -320,7 +320,7 @@ inline InferredType tryToInferType(py::handle input) { if (py::isinstance(input)) { auto object = py::cast(input); return InferredType(object.type()); -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC } else if (py::isinstance(input)) { auto rref_ivalue = input.cast().toIValue(); return InferredType(rref_ivalue.type()); @@ -716,7 +716,7 @@ inline IValue toIValue( } } case TypeKind::RRefType: { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC return obj.cast().toIValue(); #else AT_ERROR("RRef is only supported with the distributed package"); @@ -896,7 +896,7 @@ inline py::object toPyObject(IValue ivalue) { } return std::move(py_dict); } else if (ivalue.isRRef()) { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC auto RRefPtr = c10::dynamic_intrusive_pointer_cast( std::move(ivalue).toRRef()); @@ -942,7 +942,7 @@ inline py::object toPyObject(IValue ivalue) { auto py_class = getScriptedClassOrError(qualified_class_name); return py_class.attr(enum_holder->name().c_str()); } else if (ivalue.isRRef()) { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC return py::cast(torch::distributed::rpc::PyRRef( c10::static_intrusive_pointer_cast( ivalue.toRRef()))); diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index ba94d33f37b3..119b6b5e5de7 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -916,7 +916,7 @@ std::shared_ptr toSugaredValue( } else if ( obj.ptr() == py::module::import("torch.jit").attr("annotate").ptr()) { return SpecialFormValue::create(prim::annotate); -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC // RPC module is only avaialble when build flag "USE_DISTRIBUTED" is on. 
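Because the RRef/RPC pieces stay behind USE_RPC and are absent from Windows builds, a script that mixes collectives with RPC can guard the import by platform. This is only a usage sketch, not something the patch itself requires:

    import sys
    import torch.distributed as dist       # c10d collectives: available on Windows

    if sys.platform != "win32":
        import torch.distributed.rpc as rpc   # RPC/RRef: Linux/macOS only here
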
} else if ( obj.ptr() == diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 337fe66c0789..f61e2597447f 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -23,7 +23,7 @@ #include #include -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC #include using torch::distributed::autograd::DistAutogradContainer; #endif @@ -267,7 +267,7 @@ void insertLastUses(Graph& g) { } inline int64_t getDistAutogradContextId() { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC return DistAutogradContainer::currentContextId(); #else return 0; @@ -1690,7 +1690,7 @@ InterpreterState::InterpreterState( : pImpl(std::move(pImpl_)) {} void InterpreterContinuation::operator()() { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC auto prev_dist_id = DistAutogradContainer::currentContextId(); DistAutogradContainer::forceCurrentContextId(dist_autograd_context_id_); #endif @@ -1700,7 +1700,7 @@ void InterpreterContinuation::operator()() { } else { state.runAsync(stack); } -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC DistAutogradContainer::forceCurrentContextId(prev_dist_id); #endif } diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 6f911f4246cc..2bc9abea8c57 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -1,6 +1,6 @@ #include #include -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC #include #endif #include @@ -130,7 +130,7 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { "this class."; AT_ERROR(err.str()); } else if (ivalue.isRRef()) { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC TORCH_CHECK( torch::distributed::rpc::getAllowJitRRefPickle() == true, "RRef jit pickling is only allowed inside RPC calls."); @@ -166,7 +166,7 @@ void Pickler::pushDevice(const IValue& ivalue) { } } -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC void Pickler::pushRRef(const IValue& ivalue) { // It is the same as how rref is pickled in python, see PyRRef::pickle auto rrefInterface = ivalue.toRRef(); diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index c416f9641023..9b8fce0b4869 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -1,6 +1,6 @@ #include #include -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC #include #endif #include @@ -549,7 +549,7 @@ void Unpickler::readGlobal( stack_.emplace_back(int64_t(globals_.size() - 1)); return; } else if (module_name == "torch.distributed.rpc" && class_name == "rref") { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC return rebuildRRef(); #else TORCH_INTERNAL_ASSERT( @@ -669,7 +669,7 @@ void Unpickler::rebuildTensor(bool quantized) { }); } -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC void Unpickler::rebuildRRef() { globals_.emplace_back([this] { // It is the same as how rref is unpickled in python, diff --git a/torch/csrc/utils/future.h b/torch/csrc/utils/future.h index 6d672ee86cd5..093d043ecf7d 100644 --- a/torch/csrc/utils/future.h +++ b/torch/csrc/utils/future.h @@ -26,7 +26,7 @@ class TORCH_API FutureError final : public std::exception { // Most implementation is copied from FutureMessage and // c10::ivalue::Future template -class TORCH_API Future final { +class TORCH_PYTHON_API Future final { public: Future() = default; diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py index 292634580aab..4545aea2bf56 100644 --- a/torch/distributed/rendezvous.py +++ b/torch/distributed/rendezvous.py @@ -6,9 +6,12 @@ import torch._six as six 
import numbers import os -from . import FileStore, TCPStore +import sys +from . import FileStore from .constants import default_pg_timeout +if sys.platform != 'win32': + from . import TCPStore _rendezvous_handlers = {} @@ -90,6 +93,10 @@ def _error(msg): result = urlparse(url) path = result.path + if sys.platform == 'win32': + import urllib.request + path = urllib.request.url2pathname(result.path) + if not path: raise _error("path missing") query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) @@ -175,7 +182,8 @@ def _env_error(var): # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using env:// method") +if sys.platform != 'win32': + register_rendezvous_handler("tcp", _tcp_rendezvous_handler) + register_rendezvous_handler("env", _env_rendezvous_handler) register_rendezvous_handler("file", _file_rendezvous_handler) -register_rendezvous_handler("tcp", _tcp_rendezvous_handler) -register_rendezvous_handler("env", _env_rendezvous_handler) diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index 68fe49f411f5..4b206f380111 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -45,15 +45,16 @@ endfunction() set(C10D_SRCS FileStore.cpp - HashStore.cpp ProcessGroup.cpp - ProcessGroupRoundRobin.cpp Store.cpp PrefixStore.cpp - TCPStore.cpp Utils.cpp ) +if(NOT WIN32) + list(APPEND C10D_SRCS HashStore.cpp ProcessGroupRoundRobin.cpp TCPStore.cpp) +endif() + set(C10D_LIBS torch) if(USE_C10D_NCCL) @@ -77,14 +78,17 @@ endif() add_library(c10d STATIC ${C10D_SRCS}) set_property(TARGET c10d PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET c10d PROPERTY CXX_STANDARD 14) -target_compile_options(c10d PUBLIC - -Wall - -Wextra - -Wno-unused-parameter - -Wno-missing-field-initializers - -Wno-write-strings - -Wno-unknown-pragmas - ) + +if(NOT MSVC) + target_compile_options(c10d PUBLIC + -Wall + -Wextra + -Wno-unused-parameter + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + ) +endif() add_dependencies(c10d torch) @@ -118,17 +122,19 @@ if(USE_C10D_GLOO) endif() copy_header(FileStore.hpp) -copy_header(HashStore.hpp) copy_header(PrefixStore.hpp) copy_header(ProcessGroup.hpp) copy_header(Store.hpp) -copy_header(TCPStore.hpp) copy_header(Types.hpp) copy_header(Utils.hpp) if(USE_GLOO) copy_header(ProcessGroupGloo.hpp) copy_header(GlooDeviceFactory.hpp) endif() +if(NOT WIN32) + copy_header(HashStore.hpp) + copy_header(TCPStore.hpp) +endif() if(USE_C10D_NCCL) copy_header(ProcessGroupNCCL.hpp) diff --git a/torch/lib/c10d/FileStore.cpp b/torch/lib/c10d/FileStore.cpp index 55346e0fa635..eb25c52f787a 100644 --- a/torch/lib/c10d/FileStore.cpp +++ b/torch/lib/c10d/FileStore.cpp @@ -3,9 +3,16 @@ #include #include #include -#include #include + +#ifdef _WIN32 +#include +#include +#include +#else +#include #include +#endif #include #include @@ -21,6 +28,40 @@ throw std::system_error(errno, std::system_category(), ##__VA_ARGS__); \ } +#ifdef _WIN32 +#define LOCK_EX 0x00000001 +#define LOCK_SH 0x00000010 +#define LOCK_UN 0x00000100 + +int flock_(int fd, int op) { + HANDLE hdl = (HANDLE) _get_osfhandle(fd); + DWORD low = 1, high = 0; + OVERLAPPED offset = {0, 0, 0, 0, NULL}; + + if (hdl < 0) + return -1; + + switch (op) { + case LOCK_EX: + if (LockFileEx(hdl, LOCKFILE_EXCLUSIVE_LOCK, 0, low, high, &offset)) + return 0; + break; + case LOCK_SH: + if (LockFileEx(hdl, 0, 0, low, high, &offset)) + return 0; + break; + case LOCK_UN: + 
if(UnlockFileEx(hdl, 0, low, high, &offset) != 0) + return 0; + break; + default: + break; + } + errno = EINVAL; + return -1; +} +#endif + namespace c10d { namespace { @@ -79,7 +120,11 @@ class Lock { int fd_{-1}; void flock(int operation) { +#ifdef _WIN32 + auto rv = syscall(std::bind(::flock_, fd_, operation)); +#else auto rv = syscall(std::bind(::flock, fd_, operation)); +#endif SYSASSERT(rv, "flock"); } }; @@ -92,7 +137,11 @@ class File { std::chrono::milliseconds timeout) { const auto start = std::chrono::steady_clock::now(); while (true) { +#ifdef _WIN32 + fd_ = syscall(std::bind(::open, path.c_str(), flags | _O_BINARY, _S_IREAD | _S_IWRITE)); +#else fd_ = syscall(std::bind(::open, path.c_str(), flags, 0644)); +#endif // Only retry when the file doesn't exist, since we are waiting for the // file to be created in this case to address the following issue: // https://github.com/pytorch/pytorch/issues/13750 diff --git a/torch/lib/c10d/GlooDeviceFactory.cpp b/torch/lib/c10d/GlooDeviceFactory.cpp index 70c3c2bb7a31..dca6b03eb9dd 100644 --- a/torch/lib/c10d/GlooDeviceFactory.cpp +++ b/torch/lib/c10d/GlooDeviceFactory.cpp @@ -36,16 +36,16 @@ C10_DEFINE_SHARED_REGISTRY_WITHOUT_WARNING( #if GLOO_HAVE_TRANSPORT_TCP static std::shared_ptr<::gloo::transport::Device> makeTCPDevice( - const std::string& interface, + const std::string& interfaceName, const std::string& hostname) { TORCH_CHECK( - !interface.empty() || !hostname.empty(), + !interfaceName.empty() || !hostname.empty(), "GlooDeviceFactory::makeTCPDevice(): interface or hostname " "can't be empty"); ::gloo::transport::tcp::attr attr; - if (!interface.empty()) { - attr.iface = interface; + if (!interfaceName.empty()) { + attr.iface = interfaceName; } else { attr.hostname = hostname; } @@ -61,16 +61,16 @@ C10_REGISTER_CREATOR(GlooDeviceRegistry, TCP, makeTCPDevice); #if GLOO_HAVE_TRANSPORT_UV static std::shared_ptr<::gloo::transport::Device> makeUVDevice( - const std::string& interface, + const std::string& interfaceName, const std::string& hostname) { TORCH_CHECK( - !interface.empty() || !hostname.empty(), + !interfaceName.empty() || !hostname.empty(), "GlooDeviceFactory::makeUVDevice(): interface or hostname " "can't be empty"); ::gloo::transport::uv::attr attr; - if (!interface.empty()) { - attr.iface = interface; + if (!interfaceName.empty()) { + attr.iface = interfaceName; } else { attr.hostname = hostname; } @@ -81,23 +81,28 @@ static std::shared_ptr<::gloo::transport::Device> makeUVDevice( // the flexibility of other application to override by priority. Register // UV to `UV` for env "GLOO_DEVICE_TRANSPORT" override. 
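Putting the FileStore and rendezvous changes together, the initialization path that works on Windows is a file:// URL with forward slashes, mirroring the earlier test changes. A single-process sketch:

    import os
    import tempfile
    import torch.distributed as dist

    f = tempfile.NamedTemporaryFile(delete=False)
    f.close()
    init_method = f'file:///{f.name.replace(os.path.sep, "/")}'
    dist.init_process_group("gloo", init_method=init_method,
                            rank=0, world_size=1)
    dist.destroy_process_group()
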
C10_REGISTER_CREATOR(GlooDeviceRegistry, APPLE, makeUVDevice); +C10_REGISTER_CREATOR(GlooDeviceRegistry, WIN32, makeUVDevice); C10_REGISTER_CREATOR(GlooDeviceRegistry, UV, makeUVDevice); #endif static const char* glooDeviceTransport = getenv("GLOO_DEVICE_TRANSPORT"); std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory:: - makeDeviceForInterface(const std::string& interface) { + makeDeviceForInterface(const std::string& interfaceName) { if (glooDeviceTransport) { - return GlooDeviceRegistry()->Create(glooDeviceTransport, interface, ""); + return GlooDeviceRegistry()->Create(glooDeviceTransport, interfaceName, ""); } #ifdef __linux__ - return GlooDeviceRegistry()->Create("LINUX", interface, ""); + return GlooDeviceRegistry()->Create("LINUX", interfaceName, ""); #endif #ifdef __APPLE__ - return GlooDeviceRegistry()->Create("APPLE", interface, ""); + return GlooDeviceRegistry()->Create("APPLE", interfaceName, ""); +#endif + +#ifdef _WIN32 + return GlooDeviceRegistry()->Create("WIN32", interfaceName, ""); #endif throw std::runtime_error("makeDeviceForInterface(): unsupported gloo device"); @@ -117,6 +122,10 @@ std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory:: return GlooDeviceRegistry()->Create("APPLE", "", hostname); #endif +#ifdef _WIN32 + return GlooDeviceRegistry()->Create("WIN32", "", hostname); +#endif + throw std::runtime_error("makeDeviceForHostname(): unsupported gloo device"); } diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index 531fe751f1c9..c139ac7a34fd 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -2,10 +2,16 @@ #include +#ifdef _WIN32 +#include +#include +#include +#else #include #include -#include #include +#endif +#include #include @@ -36,6 +42,36 @@ #include #include +#ifdef _WIN32 +#define GENERATE_ALL_TYPES(type, func, ...) \ + switch (type) { \ + case ::at::ScalarType::Float: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Double: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Half: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Char: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Byte: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Int: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Long: \ + func(__VA_ARGS__); \ + break; \ + default: \ + throw std::runtime_error("Invalid scalar type"); \ + } + +#define HOST_NAME_MAX 256 +#else #define GENERATE_ALL_TYPES(type, func, args...) \ switch (type) { \ case ::at::ScalarType::Float: \ @@ -62,6 +98,7 @@ default: \ throw std::runtime_error("Invalid scalar type"); \ } +#endif namespace c10d { @@ -409,12 +446,19 @@ ProcessGroupGloo::Options::Options() namespace { +void socketInitialize() { +#ifdef _WIN32 + ::gloo::init_winsock(); +#endif +} + // Gloo assumes that this machine's hostname can always be resolved // to an address. If it doesn't it throws a runtime error saying // that it can't be resolved. Instead of catching it, we choose // to proactively check if an address can be resolved, so we can // gracefully fall back to an alternative if it doesn't. 
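// socketInitialize() is a no-op on POSIX platforms; on Windows it calls
// ::gloo::init_winsock() so that the getaddrinfo/socket/bind probe below can
// run before any process group has initialized Winsock.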
bool doesHostnameResolveToUsableAddress(const std::string& hostname) { + socketInitialize(); struct addrinfo hints; memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; @@ -431,7 +475,11 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) { continue; } rv = bind(fd, rp->ai_addr, rp->ai_addrlen); +#ifdef _WIN32 + closesocket(fd); +#else close(fd); +#endif if (rv == -1) { continue; } @@ -443,14 +491,11 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) { } // namespace -#if defined(__linux__) || defined(__APPLE__) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: - createDeviceForInterface(const std::string& interface) { - return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface); + createDeviceForInterface(const std::string& interface_name) { + return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface_name); } -#endif -#if defined(__linux__) || defined(__APPLE__) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: createDeviceForHostname(const std::string& hostname) { TORCH_CHECK( @@ -460,14 +505,14 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: " to a (local) address"); return ::c10d::GlooDeviceFactory::makeDeviceForHostname(hostname); } -#endif -#ifdef __linux__ +#if defined(__linux__) || defined(_WIN32) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: createDefaultDevice() { // Use the hostname to resolve the network address to // use. Note: if the hostname does not resolve to an address (e.g. // because of misconfigured /etc/hosts file), this will not work. + socketInitialize(); std::array hostname{}; auto rv = gethostname(hostname.data(), HOST_NAME_MAX); if (rv != 0) { diff --git a/torch/lib/c10d/Utils.cpp b/torch/lib/c10d/Utils.cpp index d975f6eb6bc5..6c6e941ef95d 100644 --- a/torch/lib/c10d/Utils.cpp +++ b/torch/lib/c10d/Utils.cpp @@ -1,5 +1,6 @@ #include +#ifndef _WIN32 #include #include @@ -354,6 +355,6 @@ std::tuple accept( return std::make_tuple( socket, sockaddrToString(reinterpret_cast(&addr))); } - } // namespace tcputil } // namespace c10d +#endif diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 1bdaddde9f24..1116cd39ba1c 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -1,6 +1,8 @@ #pragma once +#ifndef _WIN32 #include +#endif #include #include @@ -480,6 +482,7 @@ class ResourceGuard { bool released_; }; +#ifndef _WIN32 namespace tcputil { constexpr std::chrono::milliseconds kNoTimeout = std::chrono::milliseconds(-1); @@ -609,4 +612,5 @@ std::tuple accept( const std::chrono::milliseconds& timeout = kNoTimeout); } // namespace tcputil +#endif } // namespace c10d diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index 8429d1099b29..003f56f30861 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -8,14 +8,19 @@ function(c10d_add_test test_src) get_filename_component(test_name ${test_src} NAME_WE) add_executable(${test_name} "${test_src}") target_include_directories(${test_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..) 
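  # pthread linkage and -Wno-error assume a POSIX toolchain, so they are only
  # applied inside the if(NOT WIN32) branch; MSVC builds link just the
  # libraries passed in via ${ARGN}.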
- target_link_libraries(${test_name} pthread ${ARGN}) - target_compile_options(${test_name} PRIVATE -Wno-error) + target_link_libraries(${test_name} ${ARGN}) + if(NOT WIN32) + target_link_libraries(${test_name} pthread) + target_compile_options(${test_name} PRIVATE -Wno-error) + endif() add_test(NAME ${test_name} COMMAND $) endfunction() c10d_add_test(FileStoreTest.cpp c10d gtest_main) -c10d_add_test(HashStoreTest.cpp c10d gtest_main) -c10d_add_test(TCPStoreTest.cpp c10d gtest_main) +if(NOT WIN32) + c10d_add_test(HashStoreTest.cpp c10d gtest_main) + c10d_add_test(TCPStoreTest.cpp c10d gtest_main) +endif() if(USE_CUDA) if(USE_C10D_GLOO) @@ -29,7 +34,7 @@ if(USE_CUDA) endif() else() if(USE_C10D_GLOO) - c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d gtest_main) + c10d_add_test(ProcessGroupGlooTest.cpp c10d gtest_main) endif() endif() diff --git a/torch/lib/c10d/test/CUDATest.hpp b/torch/lib/c10d/test/CUDATest.hpp index defaff895a18..328da2faf648 100644 --- a/torch/lib/c10d/test/CUDATest.hpp +++ b/torch/lib/c10d/test/CUDATest.hpp @@ -5,9 +5,15 @@ namespace c10d { namespace test { -void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks); +#ifdef _WIN32 +#define EXPORT_TEST_API __declspec(dllexport) +#else +#define EXPORT_TEST_API +#endif -int cudaNumDevices(); +EXPORT_TEST_API void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks); + +EXPORT_TEST_API int cudaNumDevices(); } // namespace test } // namespace c10d diff --git a/torch/lib/c10d/test/FileStoreTest.cpp b/torch/lib/c10d/test/FileStoreTest.cpp index 77215f4521c2..cc8da6326091 100644 --- a/torch/lib/c10d/test/FileStoreTest.cpp +++ b/torch/lib/c10d/test/FileStoreTest.cpp @@ -1,6 +1,8 @@ #include +#ifndef _WIN32 #include +#endif #include #include @@ -10,6 +12,11 @@ #include #include +#ifdef _WIN32 +std::string tmppath() { + return c10d::test::autoGenerateTmpFilePath(); +} +#else std::string tmppath() { const char* tmpdir = getenv("TMPDIR"); if (tmpdir == nullptr) { @@ -29,6 +36,7 @@ std::string tmppath() { close(fd); return std::string(tmp.data(), tmp.size()); } +#endif void testGetSet(std::string path, std::string prefix = "") { // Basic Set/Get on File Store diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index 6606e553e733..da4f9b5fc106 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -1,7 +1,10 @@ +#ifndef _WIN32 #include -#include #include #include +#endif + +#include #include #include @@ -21,6 +24,7 @@ using namespace c10d::test; constexpr auto kSendDelay = std::chrono::milliseconds(100); constexpr auto kWaitTimeout = std::chrono::milliseconds(1); +#ifndef _WIN32 class SignalTest { public: SignalTest(const std::string& path) : path_(path) {} @@ -92,6 +96,7 @@ std::shared_ptr<::c10d::ProcessGroup::Work> testSignal( test.arm(fork.pid, signal); return test.run(0, 2); } +#endif class ProcessGroupGlooDelayed : public ::c10d::ProcessGroupGloo { public: @@ -456,6 +461,7 @@ void testRecv(const std::string& path) { EXPECT_TRUE(recvCompleted); } +#ifndef _WIN32 TEST(ProcessGroupGlooTest, testSIGSTOPException) { // test SIGSTOP // Fork() and TSAN don't play well together, so skip the test if we're testing @@ -485,6 +491,7 @@ TEST(ProcessGroupGlooTest, testSIGKILLException) { EXPECT_FALSE(work->isSuccess()); EXPECT_THROW(std::rethrow_exception(work->exception()), std::exception); } +#endif TEST(ProcessGroupGlooTest, testAllReduceCPU) { { diff --git a/torch/lib/c10d/test/TestUtils.hpp 
b/torch/lib/c10d/test/TestUtils.hpp index c62695485573..5f5dfca315cb 100644 --- a/torch/lib/c10d/test/TestUtils.hpp +++ b/torch/lib/c10d/test/TestUtils.hpp @@ -1,9 +1,12 @@ #pragma once +#ifndef _WIN32 #include -#include #include #include +#endif + +#include #include #include @@ -37,6 +40,28 @@ class Semaphore { std::condition_variable cv_; }; +#ifdef _WIN32 +std::string autoGenerateTmpFilePath() { + char tmp[L_tmpnam_s]; + errno_t err; + err = tmpnam_s(tmp, L_tmpnam_s); + if (err != 0) + { + throw std::system_error(errno, std::system_category()); + } + return std::string(tmp); +} + +std::string tmppath() { + const char* tmpfile = getenv("TMPFILE"); + if (tmpfile) { + return std::string(tmpfile); + } + else { + return autoGenerateTmpFilePath(); + } +} +#else std::string tmppath() { // TMPFILE is for manual test execution during which the user will specify // the full temp file path using the environmental variable TMPFILE @@ -63,6 +88,7 @@ std::string tmppath() { close(fd); return std::string(tmp.data(), tmp.size()); } +#endif bool isTSANEnabled() { auto s = std::getenv("PYTORCH_TEST_WITH_TSAN"); @@ -80,6 +106,7 @@ struct TemporaryFile { } }; +#ifndef _WIN32 struct Fork { pid_t pid; @@ -101,6 +128,7 @@ struct Fork { return pid == 0; } }; +#endif } // namespace test } // namespace c10d diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index f8e5b4822bd8..b2cd30c66812 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -16,7 +16,7 @@ import torch.distributed as c10d from functools import partial, reduce -from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM +from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, FILE_SCHEMA class TestSkip(NamedTuple): exit_code: int @@ -143,10 +143,23 @@ def wrapper(*args, **kwargs): return wrapper +def skip_if_win32(): + return unittest.skipIf( + sys.platform == 'win32', + "This unit test case is not supportted on Windows platform", + ) + TIMEOUT_DEFAULT = 100 TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400} +def create_device(interface=None): + if sys.platform == 'win32' or interface is None: + return c10d.ProcessGroupGloo.create_device(hostname="127.0.0.1") + else: + return c10d.ProcessGroupGloo.create_device(interface=interface) + + def get_timeout(test_id): return TIMEOUT_OVERRIDE.get(test_id.split('.')[-1], TIMEOUT_DEFAULT) @@ -206,7 +219,7 @@ def initialize_temp_directories(init_method=None): if init_method is not None: os.environ["INIT_METHOD"] = init_method else: - os.environ["INIT_METHOD"] = "file://" + os.path.join( + os.environ["INIT_METHOD"] = FILE_SCHEMA + os.path.join( init_dir_path, "shared_init_file" ) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 9959551031ff..36434ff8aa2f 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -53,6 +53,10 @@ torch.backends.disable_global_flags() +FILE_SCHEMA = "file://" +if sys.platform == 'win32': + FILE_SCHEMA = "file:///" + IS_SANDCASTLE = os.getenv('SANDCASTLE') == '1' or os.getenv('TW_JOB_USER') == 'sandcastle' class ProfilingMode(Enum): diff --git a/torch/testing/_internal/dist_utils.py b/torch/testing/_internal/dist_utils.py index b88765211df1..93de304a53ca 100644 --- a/torch/testing/_internal/dist_utils.py +++ b/torch/testing/_internal/dist_utils.py @@ -7,6 +7,7 @@ import torch.distributed as dist import torch.distributed.rpc as rpc from 
torch.distributed.rpc import _rref_context_get_debug_info # type: ignore[attr-defined] +from torch.testing._internal.common_utils import FILE_SCHEMA if not dist.is_available(): @@ -14,7 +15,7 @@ sys.exit(0) -INIT_METHOD_TEMPLATE = "file://{file_name}" +INIT_METHOD_TEMPLATE = FILE_SCHEMA + "{file_name}" def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True, diff --git a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py index 1b1f755ed4cc..09db831e9999 100644 --- a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py +++ b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py @@ -20,7 +20,7 @@ skip_if_lt_x_gpu, skip_if_rocm, ) -from torch.testing._internal.dist_utils import dist_init +from torch.testing._internal.dist_utils import dist_init, INIT_METHOD_TEMPLATE from torch.testing._internal.distributed.rpc.rpc_agent_test_fixture import ( RpcAgentTestFixture, ) @@ -329,7 +329,7 @@ def _remote_worker_process(self): gLogger.info("The remote worker is running.") dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -346,7 +346,7 @@ def _trainer_process(self, rank: int): ) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -363,7 +363,7 @@ def _master_process(self, ddp_mode: DdpMode, simulate_uneven_inputs: bool): gLogger.info("Running the master process...") dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -500,7 +500,7 @@ def _run_test_ddp_comparision(self, simulate_uneven_inputs=False): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -567,7 +567,7 @@ def test_ddp_dist_autograd_sparse_grads(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -604,7 +604,7 @@ def test_ddp_dist_autograd_local_vs_remote(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -651,7 +651,7 @@ def test_ddp_dist_autograd_local_vs_remote_gpu(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index f6f2b9a6fbfb..af5e648f6acb 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -1,5 +1,4 @@ import copy -import fcntl import itertools import random import math @@ -22,6 
+21,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.distributed.distributed_c10d import _get_default_group, AllreduceOptions, GroupMember +from torch.testing._internal.common_utils import FILE_SCHEMA from torch.testing._internal.common_distributed import ( MultiProcessTestCase, TEST_SKIPS, @@ -43,6 +43,10 @@ except ImportError: HAS_TORCHVISION = False +if sys.platform == 'win32': + import msvcrt +else: + import fcntl class Foo: def __init__(self, x): @@ -191,10 +195,17 @@ def _lock(): lockfile = os.path.join(TEMP_DIR, "lockfile") with open(lockfile, "w") as lf: try: - fcntl.flock(lf.fileno(), fcntl.LOCK_EX) - yield + if sys.platform == 'win32': + msvcrt.locking(lf.fileno(), msvcrt.LK_RLCK, 1) + yield + else: + fcntl.flock(lf.fileno(), fcntl.LOCK_EX) + yield finally: - fcntl.flock(lf.fileno(), fcntl.LOCK_UN) + if sys.platform == 'win32': + msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1) + else: + fcntl.flock(lf.fileno(), fcntl.LOCK_UN) lf.close() @@ -270,7 +281,7 @@ def tearDown(self): @property def init_method(self): - return "file://{file_name}".format(file_name=self.file_name) + return "{}{file_name}".format(FILE_SCHEMA, file_name=self.file_name) @classmethod def _run(cls, rank, test_name, file_name): @@ -2162,8 +2173,13 @@ def _test_DDP_5iter( # save the model in the middle and reload if test_save and idx == 2 and INIT_METHOD.startswith("file://"): with tempfile.NamedTemporaryFile() as tmp: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + if sys.platform == 'win32': + torch.save(model_DDP, tmp) + tmp.seek(0) + model_DDP = torch.load(tmp) + else: + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) with tempfile.TemporaryFile() as tmp_file: torch.save(model_DDP, tmp_file) @@ -2192,8 +2208,13 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None, gr # test serializable/unserializable with tempfile.NamedTemporaryFile() as tmp: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + if sys.platform == 'win32': + torch.save(model_DDP, tmp) + tmp.seek(0) + model_DDP = torch.load(tmp) + else: + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) # dummy data initialization local_bs = len(gpu_subset) @@ -2350,8 +2371,13 @@ def _test_DistributedDataParallel_SyncBatchNorm(self, gpu_subset, rank, local_bs # test serializable/unserializable with tempfile.NamedTemporaryFile() as tmp: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + if sys.platform == 'win32': + torch.save(model_DDP, tmp) + tmp.seek(0) + model_DDP = torch.load(tmp) + else: + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) # data initialization input_cpu = torch.randn(global_bs, 2) From 31ae8117baec653f6d7688d33dbabc31be5378e1 Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Thu, 24 Sep 2020 22:00:37 -0700 Subject: [PATCH 122/449] [RFC] Remove per-op-registration related code in caffe2/tools/codegen/gen.py (#45134) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45134 Per-Op-Registration was a mechanism used for mobile selective build v0. Since then, a new dispathing mechanism has been built for PyTorch, and this code path isn't used any more. Remove it to simplify understanding/updating the code-generator's code-flow. ghstack-source-id: 112723942 Test Plan: `buck build` and sandcastle. 
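(A minimal sketch of the grouping the removed per-op path performed, reusing
the same f.func.name.name access as the deleted gen.py code; the helper name
is hypothetical.)

    from collections import defaultdict

    def group_by_unoverloaded_name(native_functions):
        # "aten::add.Tensor" and "aten::add.out" both land under "aten::add"
        grouped = defaultdict(list)
        for f in native_functions:
            grouped[f"aten::{f.func.name.name}"].append(f)
        return grouped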
Reviewed By: ezyang Differential Revision: D23806632 fbshipit-source-id: d93cd324650c541d9bfc8eeff2ddb2833b988ecc --- aten/src/ATen/templates/PerOpRegistration.cpp | 15 ------ tools/codegen/gen.py | 53 ++----------------- 2 files changed, 4 insertions(+), 64 deletions(-) delete mode 100644 aten/src/ATen/templates/PerOpRegistration.cpp diff --git a/aten/src/ATen/templates/PerOpRegistration.cpp b/aten/src/ATen/templates/PerOpRegistration.cpp deleted file mode 100644 index 72ac3d784dad..000000000000 --- a/aten/src/ATen/templates/PerOpRegistration.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// ${generated_comment} - -#include -#include -#include -#include -$extra_headers - -namespace at { - -TORCH_LIBRARY_FRAGMENT_THIS_API_IS_FOR_PER_OP_REGISTRATION_ONLY(aten, m) { - ${function_registrations} -} - -} // namespace at diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index be8c57f1061a..83d9fa04cf37 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -2,7 +2,7 @@ import contextlib import textwrap import itertools -from typing import List, Dict, Optional, Iterator, Tuple, Set, Callable, Any, TypeVar, DefaultDict, Union, Sequence +from typing import List, Dict, Optional, Iterator, Tuple, Set, Callable, Any, TypeVar, Union, Sequence import yaml from enum import Enum from collections import OrderedDict @@ -914,11 +914,6 @@ def main() -> None: nargs='*', help='filter dispatch backend by the whitelist (if set), ' 'e.g.: CPU CUDA QuantizedCPU ...') - parser.add_argument( - '--per_op_registration', - action='store_true', - help='group function registrations by op name and write to separate files; ' - 'must also set --op_registration_whitelist param') parser.add_argument( '--force_schema_registration', action='store_true', @@ -1011,8 +1006,7 @@ def make_file_manager(install_dir: str) -> FileManager: 'function_registrations': list(mapMaybe( compute_type_method( dispatch, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions - )) if not options.per_op_registration else [], + native_functions)), }) del fm @@ -1037,11 +1031,11 @@ def make_file_manager(install_dir: str) -> FileManager: 'function_registrations': list(mapMaybe( compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions)) if not options.per_op_registration else [], + native_functions)), 'math_function_registrations': list(mapMaybe( compute_type_method('Math', target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions)) if not options.per_op_registration else [], + native_functions)), }) cpu_fm.write('Functions.h', lambda: { 'function_declarations': list(mapMaybe(compute_function(target=Target.DECLARATION), native_functions)), @@ -1080,45 +1074,6 @@ def computeSchemaRegister() -> Dict[str, object]: } cpu_fm.write('SchemaRegister.cpp', computeSchemaRegister) - if options.per_op_registration: - def gen_per_op_registration_filename(opname: str) -> str: - return 'pt_op_register_{}.cpp'.format(opname.replace(':', '-')) - - if op_registration_whitelist is None: - raise Exception("Must set --op_registration_whitelist for per-op registration.") - - # First, group all native functions by unoverloaded operator name - grouped_functions : DefaultDict[str, List[NativeFunction]] = DefaultDict(list) - for f in native_functions: - grouped_functions[f"aten::{f.func.name.name}"].append(f) - extra_headers = [] - for b in backends: - extra_headers.append(f'#include ') - - # Next, generate registration 
for each one - for name in op_registration_whitelist: - def computePerOpRegistration() -> Dict[str, object]: - fs = grouped_functions[name] - registrations: List[str] = [] - for mb_dispatch in itertools.chain([None], backends): - # or you could pass in op_registration_whitelist, it doesn't - # matter! - # NB: Use of compute_type_method here is kind of an abuse; - # this is why we have to unconditionally write in - # torch::dispatch in the registration when it should be - # contextually clear - registrations.extend( - mapMaybe( - compute_type_method(mb_dispatch, target=Target.REGISTRATION, op_registration_whitelist=None), - fs)) - return { - 'extra_headers': extra_headers, - 'function_registrations': registrations, - } - - cpu_fm.write_with_template( - gen_per_op_registration_filename(name), 'PerOpRegistration.cpp', computePerOpRegistration) - cpu_fm.write('Declarations.yaml', lambda: format_yaml(list(map(compute_declaration_yaml, native_functions)))) if options.output_dependencies: From bc3151dee0b73e10c64788fce2d822e96aeffb4a Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Thu, 24 Sep 2020 22:10:52 -0700 Subject: [PATCH 123/449] [quant] Remove unused qconfig argument in qat linear module (#45307) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45307 fixes: https://github.com/pytorch/pytorch/issues/35634 Test Plan: Imported from OSS Reviewed By: vkuzo Differential Revision: D23917339 fbshipit-source-id: 65f8844b98198bbf93547b3d71408c2a54605218 --- torch/nn/intrinsic/qat/modules/conv_fused.py | 17 ++++++++--------- torch/nn/intrinsic/qat/modules/linear_relu.py | 4 ++-- torch/nn/qat/modules/conv.py | 7 +++---- torch/nn/qat/modules/linear.py | 7 +++---- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/torch/nn/intrinsic/qat/modules/conv_fused.py b/torch/nn/intrinsic/qat/modules/conv_fused.py index db46bb5ac2ee..5a8b0f042db1 100644 --- a/torch/nn/intrinsic/qat/modules/conv_fused.py +++ b/torch/nn/intrinsic/qat/modules/conv_fused.py @@ -162,7 +162,7 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, miss state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) @classmethod - def from_float(cls, mod, qconfig=None): + def from_float(cls, mod): r"""Create a qat module from a float module or qparams_dict Args: `mod` a float module, either produced by torch.quantization utilities @@ -170,10 +170,9 @@ def from_float(cls, mod, qconfig=None): """ assert type(mod) == cls._FLOAT_MODULE, 'qat.' 
+ cls.__name__ + '.from_float only works for ' + \ cls._FLOAT_MODULE.__name__ - if not qconfig: - assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' - assert mod.qconfig, 'Input float module must have a valid qconfig' - qconfig = mod.qconfig + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + assert mod.qconfig, 'Input float module must have a valid qconfig' + qconfig = mod.qconfig conv, bn = mod[0], mod[1] qat_convbn = cls(conv.in_channels, conv.out_channels, conv.kernel_size, conv.stride, conv.padding, conv.dilation, @@ -278,8 +277,8 @@ def forward(self, input): return F.relu(ConvBn2d._forward(self, input)) @classmethod - def from_float(cls, mod, qconfig=None): - return super(ConvBnReLU2d, cls).from_float(mod, qconfig) + def from_float(cls, mod): + return super(ConvBnReLU2d, cls).from_float(mod) class ConvReLU2d(nnqat.Conv2d): r""" @@ -313,8 +312,8 @@ def forward(self, input): self._conv_forward(input, self.weight_fake_quant(self.weight))) @classmethod - def from_float(cls, mod, qconfig=None): - return super(ConvReLU2d, cls).from_float(mod, qconfig) + def from_float(cls, mod): + return super(ConvReLU2d, cls).from_float(mod) def update_bn_stats(mod): if type(mod) in set([ConvBnReLU2d, ConvBn2d]): diff --git a/torch/nn/intrinsic/qat/modules/linear_relu.py b/torch/nn/intrinsic/qat/modules/linear_relu.py index 03f556c4ac2e..b11072ddb7be 100644 --- a/torch/nn/intrinsic/qat/modules/linear_relu.py +++ b/torch/nn/intrinsic/qat/modules/linear_relu.py @@ -34,5 +34,5 @@ def forward(self, input): return F.relu(F.linear(input, self.weight_fake_quant(self.weight), self.bias)) @classmethod - def from_float(cls, mod, qconfig=None): - return super(LinearReLU, cls).from_float(mod, qconfig) + def from_float(cls, mod): + return super(LinearReLU, cls).from_float(mod) diff --git a/torch/nn/qat/modules/conv.py b/torch/nn/qat/modules/conv.py index 63fb4b0fa1fd..7daeecddd4e1 100644 --- a/torch/nn/qat/modules/conv.py +++ b/torch/nn/qat/modules/conv.py @@ -32,7 +32,7 @@ def forward(self, input): return self._conv_forward(input, self.weight_fake_quant(self.weight)) @classmethod - def from_float(cls, mod, qconfig=None): + def from_float(cls, mod): r"""Create a qat module from a float module or qparams_dict Args: `mod` a float module, either produced by torch.quantization utilities @@ -40,9 +40,8 @@ def from_float(cls, mod, qconfig=None): """ assert type(mod) == cls._FLOAT_MODULE, 'qat.' + cls.__name__ + '.from_float only works for ' + \ cls._FLOAT_MODULE.__name__ - if not qconfig: - assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' - assert mod.qconfig, 'Input float module must have a valid qconfig' + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + assert mod.qconfig, 'Input float module must have a valid qconfig' if type(mod) == ConvReLU2d: mod = mod[0] qconfig = mod.qconfig diff --git a/torch/nn/qat/modules/linear.py b/torch/nn/qat/modules/linear.py index 77998426239f..47fc40b9b6c0 100644 --- a/torch/nn/qat/modules/linear.py +++ b/torch/nn/qat/modules/linear.py @@ -30,7 +30,7 @@ def forward(self, input): return F.linear(input, self.weight_fake_quant(self.weight), self.bias) @classmethod - def from_float(cls, mod, qconfig=None): + def from_float(cls, mod): r"""Create a qat module from a float module or qparams_dict Args: `mod` a float module, either produced by torch.quantization utilities @@ -38,9 +38,8 @@ def from_float(cls, mod, qconfig=None): """ assert type(mod) == cls._FLOAT_MODULE, ' qat.' 
+ cls.__name__ + '.from_float only works for ' + \ cls._FLOAT_MODULE.__name__ - if not qconfig: - assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' - assert mod.qconfig, 'Input float module must have a valid qconfig' + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + assert mod.qconfig, 'Input float module must have a valid qconfig' if type(mod) == LinearReLU: mod = mod[0] From 103fa3894a0dff4bd697688a4a5d6095cd45162e Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Thu, 24 Sep 2020 22:42:46 -0700 Subject: [PATCH 124/449] Revert D23841786: [pytorch][PR] Enable distributed package on windows, Gloo backend supported only Test Plan: revert-hammer Differential Revision: D23841786 (https://github.com/pytorch/pytorch/commit/0122299f9ba729aa0c9bd43764af53225e03672c) Original commit changeset: 334ba1ed73ef fbshipit-source-id: ec95432f9957df56a5a04e52661f5db920b7f57f --- .../install_miniconda3.bat | 7 --- CMakeLists.txt | 8 +-- caffe2/CMakeLists.txt | 49 ++++++--------- cmake/Dependencies.cmake | 5 +- test/cpp/dist_autograd/CMakeLists.txt | 2 +- test/distributed/test_c10d.py | 49 +++++---------- test/distributed/test_c10d_spawn.py | 8 +-- test/run_test.py | 11 ++-- tools/build_variables.bzl | 7 +-- torch/CMakeLists.txt | 33 +++++----- torch/csrc/Module.cpp | 4 +- torch/csrc/WindowsTorchApiMacro.h | 6 -- torch/csrc/distributed/c10d/comm.h | 4 +- torch/csrc/distributed/c10d/init.cpp | 10 +-- torch/csrc/distributed/c10d/reducer.cpp | 22 ++++--- torch/csrc/distributed/c10d/reducer.h | 14 ----- torch/csrc/jit/python/pybind_utils.h | 8 +-- .../csrc/jit/python/python_sugared_value.cpp | 2 +- torch/csrc/jit/runtime/interpreter.cpp | 8 +-- torch/csrc/jit/serialization/pickler.cpp | 6 +- torch/csrc/jit/serialization/unpickler.cpp | 6 +- torch/csrc/utils/future.h | 2 +- torch/distributed/rendezvous.py | 14 +---- torch/lib/c10d/CMakeLists.txt | 32 ++++------ torch/lib/c10d/FileStore.cpp | 51 +--------------- torch/lib/c10d/GlooDeviceFactory.cpp | 33 ++++------ torch/lib/c10d/ProcessGroupGloo.cpp | 61 +++---------------- torch/lib/c10d/Utils.cpp | 3 +- torch/lib/c10d/Utils.hpp | 4 -- torch/lib/c10d/test/CMakeLists.txt | 15 ++--- torch/lib/c10d/test/CUDATest.hpp | 10 +-- torch/lib/c10d/test/FileStoreTest.cpp | 8 --- torch/lib/c10d/test/ProcessGroupGlooTest.cpp | 9 +-- torch/lib/c10d/test/TestUtils.hpp | 30 +-------- torch/testing/_internal/common_distributed.py | 17 +----- torch/testing/_internal/common_utils.py | 4 -- torch/testing/_internal/dist_utils.py | 3 +- .../ddp_under_dist_autograd_test.py | 16 ++--- .../_internal/distributed/distributed_test.py | 48 ++++----------- 39 files changed, 167 insertions(+), 462 deletions(-) diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat index cf7255ce3789..a66ef4b651c5 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat @@ -12,11 +12,4 @@ call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Minic if "%REBUILD%"=="" ( call conda install -y -q python=%PYTHON_VERSION% numpy cffi pyyaml boto3 call conda install -y -q -c conda-forge cmake - call conda install -y -q -c rdonnelly libuv ) - -:: Get installed libuv path -@echo off -set libuv_ROOT=%CONDA_PARENT_DIR%\Miniconda3\Library -@echo on -echo libuv_ROOT=%libuv_ROOT% diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 3d937e0e1655..826c187b602e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,7 +103,7 @@ endif() # For non-supported platforms, turn USE_DISTRIBUTED off by default. # It is not tested and likely won't work without additional changes. -if(NOT LINUX AND NOT WIN32) +if(NOT LINUX) set(USE_DISTRIBUTED OFF CACHE STRING "Use distributed") # On macOS, if USE_DISTRIBUTED is enabled (specified by the user), # then make Gloo build with the libuv transport. @@ -226,12 +226,6 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." ON) option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF) -# Since TensorPipe does not support Windows, set it to OFF when WIN32 detected -if(WIN32) - set(USE_TENSORPIPE OFF) - message(WARNING "TensorPipe cannot be used on Windows. Set it to OFF") -endif() - # Linux distributions do not want too many embedded sources, in that sense we # need to be able to build pytorch with an (almost) empty third_party # directory. diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 219b28c69695..65f072b6f29d 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -291,29 +291,26 @@ endif() if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) if(USE_DISTRIBUTED) + add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") + target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) + add_dependencies(process_group_agent torch c10d) # Define this target even if we're building without TensorPipe, to make life # easier to other targets that depend on this. However, in that case, by not # setting the USE_TENSORPIPE compile definition, this target will just end # up being empty. Downstream targets should also add a #ifdef guard. 
- if(NOT WIN32) - add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") - target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) - add_dependencies(process_group_agent torch c10d) - - add_library(tensorpipe_agent - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" - ) - target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) - add_dependencies(tensorpipe_agent torch c10d) - if(USE_TENSORPIPE) - target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) - target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) - add_dependencies(tensorpipe_agent tensorpipe) - endif() + add_library(tensorpipe_agent + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" + ) + target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) + add_dependencies(tensorpipe_agent torch c10d) + if(USE_TENSORPIPE) + target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) + target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) + add_dependencies(tensorpipe_agent tensorpipe) endif() endif() @@ -496,7 +493,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" ) endif() - if(USE_DISTRIBUTED AND NOT WIN32) + if(USE_DISTRIBUTED) append_filelist("libtorch_distributed_sources" TORCH_SRCS) endif() endif() @@ -840,7 +837,7 @@ endif() if(BUILD_TEST AND NOT USE_ROCM) add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) add_subdirectory(${TORCH_ROOT}/test/cpp/tensorexpr ${CMAKE_BINARY_DIR}/test_tensorexpr) - if(USE_DISTRIBUTED AND NOT WIN32) + if(USE_DISTRIBUTED) add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc) endif() endif() @@ -892,7 +889,9 @@ endif() DESTINATION share/cmake/Torch) if(USE_DISTRIBUTED) - add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) + if(NOT MSVC) + add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) + endif() endif() @@ -967,14 +966,6 @@ if(USE_DISTRIBUTED) target_compile_definitions(torch_cpu PRIVATE USE_DISTRIBUTED ) - # Pass USE_RPC in order to reduce use of - # #if defined(USE_DISTRIBUTED) && !defined(_WIN32) - # need to be removed when RPC is supported - if(NOT WIN32) - target_compile_definitions(torch_cpu PRIVATE - USE_RPC - ) - endif() # Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp # can only be compiled with USE_TENSORPIPE is set. 
if(USE_TENSORPIPE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 023bbe9e8d07..028098f61d36 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1253,7 +1253,10 @@ if(USE_CUDA) endif() if(USE_GLOO) - if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(MSVC) + message(WARNING "Gloo can not be used on Windows.") + caffe2_update_option(USE_GLOO OFF) + elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) message(WARNING "Gloo can only be used on 64-bit systems.") caffe2_update_option(USE_GLOO OFF) else() diff --git a/test/cpp/dist_autograd/CMakeLists.txt b/test/cpp/dist_autograd/CMakeLists.txt index 9969c63e16d5..5d23602881f0 100644 --- a/test/cpp/dist_autograd/CMakeLists.txt +++ b/test/cpp/dist_autograd/CMakeLists.txt @@ -1,4 +1,4 @@ -if(USE_DISTRIBUTED AND NOT WIN32) +if(USE_DISTRIBUTED) set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd") set(DIST_AUTOGRAD_TEST_SOURCES ${TORCH_ROOT}/test/cpp/common/main.cpp diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 911a73ce432e..a81bc53f175a 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -29,7 +29,7 @@ from torch.testing._internal.common_distributed import MultiProcessTestCase, \ requires_gloo, requires_nccl, requires_nccl_version, \ skip_if_not_multigpu, skip_if_lt_x_gpu, get_timeout, skip_if_rocm, \ - simple_sparse_reduce_tests, skip_if_win32, create_device + simple_sparse_reduce_tests from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, \ retry_on_connect_failures, ADDRESS_IN_USE, CONNECT_TIMEOUT, TEST_WITH_TSAN @@ -255,7 +255,6 @@ def create_tcp_store(addr): raise RuntimeError("Unable to find free port (tried %s)" % ", ".join(ports)) -@skip_if_win32() class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): store = create_tcp_store('localhost') @@ -274,7 +273,6 @@ def test_address_already_in_use(self): store2 = c10d.TCPStore(addr, port, 1, True) # noqa: F841 -@skip_if_win32() class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): super(PrefixTCPStoreTest, self).setUp() @@ -331,7 +329,6 @@ def test_unknown_handler(self): c10d.rendezvous('invalid://') -@skip_if_win32() class RendezvousEnvTest(TestCase): @retry_on_connect_failures def test_common_errors(self): @@ -458,7 +455,7 @@ def test_common_errors(self): def test_nominal(self): with tempfile.NamedTemporaryFile(delete=False) as file: - url = f'file:///{file.name.replace(os.path.sep, "/")}?world_size=2' + url = 'file://%s?world_size=%d' % (file.name, 2) gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -477,7 +474,6 @@ def test_nominal(self): self.assertEqual(b"value1", store0.get("key1")) -@skip_if_win32() class RendezvousTCPTest(TestCase): def create_tcp_url(self): @@ -548,13 +544,9 @@ def _test_store_timeout(self, backend, init_method, c2p): def _init_methods(self): f = tempfile.NamedTemporaryFile(delete=False) - if sys.platform == 'win32': - yield "file:///%s" % f.name.replace("\\", "/") - f.close() - else: - yield "file://%s" % f.name - f.close() - yield "tcp://127.0.0.1:%d" % common.find_free_port() + yield "file://%s" % f.name + f.close() + yield "tcp://127.0.0.1:%d" % common.find_free_port() def _test_default_store_timeout(self, backend): for init_method in self._init_methods(): @@ -592,16 +584,11 @@ def test_default_store_timeout_gloo(self): class ProcessGroupGlooTest(MultiProcessTestCase): def setUp(self): super(ProcessGroupGlooTest, self).setUp() - - # For Windows platform, 
Python does not support fork, change it to spawn here. - if sys.platform == 'win32': - self._spawn_processes() - else: - self._fork_processes() + self._fork_processes() def opts(self, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [create_device(interface=LOOPBACK)] + opts.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] opts.timeout = 5.0 opts.threads = threads return opts @@ -611,8 +598,8 @@ def test_multi_device_constructor(self): opts = c10d.ProcessGroupGloo.Options() opts.timeout = 5.0 opts.devices = [ - create_device(interface=LOOPBACK), - create_device(interface=LOOPBACK), + c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), + c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), ] pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts) @@ -1527,7 +1514,6 @@ def test_barrier_implies_wait(self): for i, tensor in enumerate(tensors): self.assertEqual(torch.full(size, float(i * self.world_size)), tensor) - @skip_if_win32() def test_round_robin(self): num_process_groups = 2 store = c10d.FileStore(self.file_name, self.world_size) @@ -1545,7 +1531,6 @@ def test_round_robin(self): pg.broadcast(tensor, root=0).wait() self.assertEqual(torch.full([100, 100], 0.), tensor) - @skip_if_win32() def test_round_robin_create_destroy(self): store = c10d.FileStore(self.file_name, self.world_size) @@ -1974,10 +1959,7 @@ def forward(self, x): class DistributedDataParallelTest(MultiProcessTestCase): def setUp(self): super(DistributedDataParallelTest, self).setUp() - if sys.platform == 'win32': - self._spawn_processes() - else: - self._fork_processes() + self._fork_processes() def tearDown(self): # DistributedDataParallel test doesn't seem to call FileStore destructor @@ -2086,7 +2068,7 @@ def update_parameters(model): def _test_gloo_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [create_device(interface=LOOPBACK)] + options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @@ -3965,10 +3947,7 @@ def test_nccl_timeout(self): class CommTest(MultiProcessTestCase): def setUp(self): super(CommTest, self).setUp() - if sys.platform == 'win32': - self._spawn_processes() - else: - self._fork_processes() + self._fork_processes() def tearDown(self): super(CommTest, self).tearDown() @@ -4034,7 +4013,7 @@ def test_broadcast_coalesced_nccl(self): def test_broadcast_coalesced_gloo_cuda(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [create_device(interface=LOOPBACK)] + options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cuda:%d" % self.rank) ranks = list(range(self.world_size)) @@ -4045,7 +4024,7 @@ def test_broadcast_coalesced_gloo_cuda(self): def test_broadcast_coalesced_gloo_cpu(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [create_device(interface=LOOPBACK)] + options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) 
device = torch.device("cpu") ranks = list(range(self.world_size)) diff --git a/test/distributed/test_c10d_spawn.py b/test/distributed/test_c10d_spawn.py index c84608e8f178..d0bf00b8a08a 100644 --- a/test/distributed/test_c10d_spawn.py +++ b/test/distributed/test_c10d_spawn.py @@ -10,10 +10,8 @@ import torch.nn as nn from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU -from torch.testing._internal.common_distributed import requires_gloo, \ - create_device -from torch.testing._internal.common_utils import TestCase, load_tests, \ - run_tests, skipIfRocm +from torch.testing._internal.common_distributed import requires_gloo +from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, skipIfRocm from torch.testing._internal.common_utils import NO_MULTIPROCESSING_SPAWN, TEST_WITH_TSAN @@ -41,7 +39,7 @@ class ProcessGroupShareTensorTest(TestCase): @classmethod def opts(cls, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [create_device(interface='lo')] + opts.devices = [c10d.ProcessGroupGloo.create_device(interface="lo")] opts.timeout = 5.0 opts.threads = threads return opts diff --git a/test/run_test.py b/test/run_test.py index 0f9d14a78605..d63fc372f9c2 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -13,7 +13,7 @@ import torch import torch._six from torch.utils import cpp_extension -from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell, FILE_SCHEMA +from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell import torch.distributed as dist from typing import Dict, Optional @@ -99,6 +99,7 @@ 'distributed/rpc/test_process_group_agent', 'distributed/rpc/test_tensorpipe_agent', 'distributed/test_distributed_fork', + 'distributed/test_distributed_spawn', ] ROCM_BLOCKLIST = [ @@ -305,13 +306,9 @@ def test_distributed(test_module, test_directory, options): 'MPI not available -- MPI backend tests will be skipped') config = DISTRIBUTED_TESTS_CONFIG for backend, env_vars in config.items(): - if sys.platform == 'win32' and backend != 'gloo': - continue if backend == 'mpi' and not mpi_available: continue for with_init_file in {True, False}: - if sys.platform == 'win32' and not with_init_file: - continue tmp_dir = tempfile.mkdtemp() if options.verbose: init_str = "with {} init_method" @@ -325,9 +322,9 @@ def test_distributed(test_module, test_directory, options): os.environ.update(env_vars) if with_init_file: if test_module in ["test_distributed_fork", "test_distributed_spawn"]: - init_method = f'{FILE_SCHEMA}{tmp_dir}/' + init_method = 'file://{}/'.format(tmp_dir) else: - init_method = f'{FILE_SCHEMA}{tmp_dir}/shared_init_file' + init_method = 'file://{}/shared_init_file'.format(tmp_dir) os.environ['INIT_METHOD'] = init_method try: os.mkdir(os.path.join(tmp_dir, 'barrier')) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index c21fab8ec2cf..174bb858da44 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -537,14 +537,11 @@ libtorch_python_core_sources = [ "torch/csrc/utils/disable_torch_function.cpp", ] -libtorch_python_distributed_core_sources = [ +libtorch_python_distributed_sources = [ + "torch/csrc/distributed/autograd/init.cpp", "torch/csrc/distributed/c10d/comm.cpp", "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/reducer.cpp", -] - -libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ - "torch/csrc/distributed/autograd/init.cpp", "torch/csrc/distributed/rpc/init.cpp", 
"torch/csrc/distributed/rpc/process_group_agent.cpp", "torch/csrc/distributed/rpc/py_rref.cpp", diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 2ae2f7f737fe..b78dc4a362a7 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -160,28 +160,25 @@ endif() if(USE_DISTRIBUTED) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED) - if(WIN32) - append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) - else() - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RPC) + if(NOT MSVC) append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS) + # Disable certain warnings for GCC-9.X + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + endif() + list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) + if(USE_TENSORPIPE) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) + list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) + endif() endif() - # Disable certain warnings for GCC-9.X - if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - endif() - if(USE_TENSORPIPE) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) - list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) - endif() - list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) endif() -if(USE_NCCL AND NOT WIN32) +if(USE_NCCL) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index ae6f15155f2a..ed4aa21a8f76 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -688,9 +688,9 @@ PyObject* initModule() { #ifdef USE_CUDA THPUtils_addPyMethodDefs(methods, THCPModule_methods()); #endif -#if defined(USE_DISTRIBUTED) && defined(USE_C10D) +#ifdef USE_DISTRIBUTED +#ifdef USE_C10D THPUtils_addPyMethodDefs(methods, torch::distributed::c10d::python_functions()); -#ifndef _WIN32 THPUtils_addPyMethodDefs(methods, torch::distributed::rpc::python_functions()); THPUtils_addPyMethodDefs( methods, torch::distributed::autograd::python_functions()); diff --git a/torch/csrc/WindowsTorchApiMacro.h b/torch/csrc/WindowsTorchApiMacro.h index 7f44db0baba9..7f8ef4e01677 100644 --- a/torch/csrc/WindowsTorchApiMacro.h +++ b/torch/csrc/WindowsTorchApiMacro.h @@ -5,9 +5,3 @@ // There's no difference between aten, torch and caffe2 
libs any more // TODO: clean up the naming for consistency #define TORCH_API CAFFE2_API - -#ifdef _WIN32 -#define TORCH_PYTHON_API -#else -#define TORCH_PYTHON_API CAFFE2_API -#endif diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index 2eb626c40232..e2b501f08aff 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -38,7 +38,7 @@ class GradBucket { // DDP's c10d reducer allows communication hooks defined as a sub class // of CommHookInterface. CommHookInterface is an abstract class and can // be used to implement both Python and CPP hooks. -struct TORCH_PYTHON_API CommHookInterface { +struct TORCH_API CommHookInterface { public: virtual ~CommHookInterface() {} @@ -59,7 +59,7 @@ struct TORCH_PYTHON_API CommHookInterface { // PythonCommHook enables registering a python hook to c10d reducer and is a // sub class of CommHookInterface. -class TORCH_PYTHON_API PythonCommHook : public CommHookInterface { +class TORCH_API PythonCommHook : public CommHookInterface { public: // The constructor takes a state and a callable hook. Inputs are Python // objects. The state is passed to the hook in runHook function can be used to diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index be1752d7366f..165d6a1c8603 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1,11 +1,7 @@ #include #include -#ifndef _WIN32 #include -#include -#include -#endif #include #ifdef USE_C10D_GLOO @@ -21,6 +17,8 @@ #endif #include +#include +#include #include #include @@ -325,7 +323,6 @@ They are used in specifying strategies for reduction collectives, e.g., shared_ptr_class_<::c10d::FileStore>(module, "FileStore", store) .def(py::init()); -#ifndef _WIN32 shared_ptr_class_<::c10d::HashStore>(module, "HashStore", store) .def(py::init<>()); @@ -343,7 +340,6 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("is_master"), py::arg("timeout") = std::chrono::milliseconds(::c10d::Store::kDefaultTimeout)); -#endif shared_ptr_class_<::c10d::PrefixStore>(module, "PrefixStore", store) .def(py::init>()); @@ -611,7 +607,6 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("opts") = ::c10d::BarrierOptions(), py::call_guard()); -#ifndef _WIN32 module.def( "_round_robin_process_groups", [](std::vector> processGroups) @@ -625,7 +620,6 @@ They are used in specifying strategies for reduction collectives, e.g., }, py::arg("process_groups"), py::call_guard()); -#endif #ifdef USE_C10D_GLOO auto processGroupGloo = shared_ptr_class_<::c10d::ProcessGroupGloo>( diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 814d3494ff4e..86916c7994dd 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -89,7 +89,10 @@ Reducer::Reducer( for (size_t variable_index = 0; variable_index < variable_count; variable_index++) { auto& variable = replicas_[replica_index][variable_index]; - const auto index = VariableIndex(replica_index, variable_index); + const auto index = VariableIndex{ + .replica_index = replica_index, + .variable_index = variable_index, + }; // The gradient accumulator function is lazily initialized once. 
// Therefore we can use its presence in the autograd graph as @@ -97,19 +100,15 @@ Reducer::Reducer( auto grad_accumulator = torch::autograd::impl::grad_accumulator(variable); -#ifndef _WIN32 using torch::distributed::autograd::ThreadLocalDistAutogradContext; -#endif // Hook to execute after the gradient accumulator has executed. hooks_.emplace_back( grad_accumulator->add_post_hook( torch::make_unique( [=](const torch::autograd::variable_list& outputs, const torch::autograd::variable_list& /* unused */) { -#ifndef _WIN32 this->rpc_context_.set( ThreadLocalDistAutogradContext::getContextPtr()); -#endif this->autograd_hook(index); return outputs; })), @@ -478,7 +477,10 @@ void Reducer::push_rebuilt_params_for_all_indices() { const auto variable_count = replicas_[replica_index].size(); for (size_t variable_index = 0; variable_index < variable_count; ++variable_index) { - const auto index = VariableIndex(replica_index, variable_index); + const auto index = VariableIndex{ + .replica_index = replica_index, + .variable_index = variable_index, + }; push_rebuilt_params(index); } } @@ -848,8 +850,10 @@ void Reducer::initialize_buckets( TORCH_CHECK( variable_index < variable_locators_.size(), "Out of range variable index specified."); - variable_locators_[variable_index] = VariableLocator( - bucket_index, intra_bucket_index++); + variable_locators_[variable_index] = VariableLocator{ + .bucket_index = bucket_index, + .intra_bucket_index = intra_bucket_index++, + }; } bucket.variable_indices = std::move(bucket_indices[bucket_index]); @@ -1231,9 +1235,7 @@ void Reducer::runGradCallbackForVariable( cb(variable.mutable_grad()); } else { // Under distributed autograd -#ifndef _WIN32 context_ptr->runGradCallbackForVariable(variable, std::move(cb)); -#endif } } diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 486b7337366a..960a32356acf 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -104,13 +104,6 @@ class Reducer { struct VariableIndex { size_t replica_index; size_t variable_index; - - VariableIndex() = default; - - VariableIndex(size_t replica_index_, size_t variable_index_) { - replica_index = replica_index_; - variable_index = variable_index_; - } }; void push_rebuilt_params(const VariableIndex& index); @@ -288,13 +281,6 @@ class Reducer { size_t bucket_index; // Index of parameter in single bucket replica. size_t intra_bucket_index; - - VariableLocator() = default; - - VariableLocator(size_t bucket_index_, size_t intra_bucket_index_) { - bucket_index = bucket_index_; - intra_bucket_index = intra_bucket_index_; - } }; // Map the index of a variable to its location in the bucket structure. 
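(Context for the reducer.h/reducer.cpp hunks above: the revert restores
designated-initializer construction,

    const auto index = VariableIndex{
        .replica_index = replica_index,
        .variable_index = variable_index,
    };

which is only standard C++ from C++20 onward. GCC and Clang accept it as an
extension under the C++14 mode used here, while MSVC does not, which is
presumably why the reverted Windows patch had introduced the explicit
two-argument constructors removed above.)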
diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index 4be55a9caa90..65f5a49145c8 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -320,7 +320,7 @@ inline InferredType tryToInferType(py::handle input) { if (py::isinstance(input)) { auto object = py::cast(input); return InferredType(object.type()); -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED } else if (py::isinstance(input)) { auto rref_ivalue = input.cast().toIValue(); return InferredType(rref_ivalue.type()); @@ -716,7 +716,7 @@ inline IValue toIValue( } } case TypeKind::RRefType: { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED return obj.cast().toIValue(); #else AT_ERROR("RRef is only supported with the distributed package"); @@ -896,7 +896,7 @@ inline py::object toPyObject(IValue ivalue) { } return std::move(py_dict); } else if (ivalue.isRRef()) { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED auto RRefPtr = c10::dynamic_intrusive_pointer_cast( std::move(ivalue).toRRef()); @@ -942,7 +942,7 @@ inline py::object toPyObject(IValue ivalue) { auto py_class = getScriptedClassOrError(qualified_class_name); return py_class.attr(enum_holder->name().c_str()); } else if (ivalue.isRRef()) { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED return py::cast(torch::distributed::rpc::PyRRef( c10::static_intrusive_pointer_cast( ivalue.toRRef()))); diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 119b6b5e5de7..ba94d33f37b3 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -916,7 +916,7 @@ std::shared_ptr toSugaredValue( } else if ( obj.ptr() == py::module::import("torch.jit").attr("annotate").ptr()) { return SpecialFormValue::create(prim::annotate); -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED // RPC module is only avaialble when build flag "USE_DISTRIBUTED" is on. 
} else if ( obj.ptr() == diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index f61e2597447f..337fe66c0789 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -23,7 +23,7 @@ #include #include -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED #include using torch::distributed::autograd::DistAutogradContainer; #endif @@ -267,7 +267,7 @@ void insertLastUses(Graph& g) { } inline int64_t getDistAutogradContextId() { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED return DistAutogradContainer::currentContextId(); #else return 0; @@ -1690,7 +1690,7 @@ InterpreterState::InterpreterState( : pImpl(std::move(pImpl_)) {} void InterpreterContinuation::operator()() { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED auto prev_dist_id = DistAutogradContainer::currentContextId(); DistAutogradContainer::forceCurrentContextId(dist_autograd_context_id_); #endif @@ -1700,7 +1700,7 @@ void InterpreterContinuation::operator()() { } else { state.runAsync(stack); } -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED DistAutogradContainer::forceCurrentContextId(prev_dist_id); #endif } diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 2bc9abea8c57..6f911f4246cc 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -1,6 +1,6 @@ #include #include -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED #include #endif #include @@ -130,7 +130,7 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { "this class."; AT_ERROR(err.str()); } else if (ivalue.isRRef()) { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED TORCH_CHECK( torch::distributed::rpc::getAllowJitRRefPickle() == true, "RRef jit pickling is only allowed inside RPC calls."); @@ -166,7 +166,7 @@ void Pickler::pushDevice(const IValue& ivalue) { } } -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED void Pickler::pushRRef(const IValue& ivalue) { // It is the same as how rref is pickled in python, see PyRRef::pickle auto rrefInterface = ivalue.toRRef(); diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index 9b8fce0b4869..c416f9641023 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -1,6 +1,6 @@ #include #include -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED #include #endif #include @@ -549,7 +549,7 @@ void Unpickler::readGlobal( stack_.emplace_back(int64_t(globals_.size() - 1)); return; } else if (module_name == "torch.distributed.rpc" && class_name == "rref") { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED return rebuildRRef(); #else TORCH_INTERNAL_ASSERT( @@ -669,7 +669,7 @@ void Unpickler::rebuildTensor(bool quantized) { }); } -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED void Unpickler::rebuildRRef() { globals_.emplace_back([this] { // It is the same as how rref is unpickled in python, diff --git a/torch/csrc/utils/future.h b/torch/csrc/utils/future.h index 093d043ecf7d..6d672ee86cd5 100644 --- a/torch/csrc/utils/future.h +++ b/torch/csrc/utils/future.h @@ -26,7 +26,7 @@ class TORCH_API FutureError final : public std::exception { // Most implementation is copied from FutureMessage and // c10::ivalue::Future template -class TORCH_PYTHON_API Future final { +class TORCH_API Future final { public: Future() = default; diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py index 4545aea2bf56..292634580aab 100644 --- a/torch/distributed/rendezvous.py +++ b/torch/distributed/rendezvous.py @@ -6,12 +6,9 @@ import torch._six as six 
import numbers import os -import sys -from . import FileStore +from . import FileStore, TCPStore from .constants import default_pg_timeout -if sys.platform != 'win32': - from . import TCPStore _rendezvous_handlers = {} @@ -93,10 +90,6 @@ def _error(msg): result = urlparse(url) path = result.path - if sys.platform == 'win32': - import urllib.request - path = urllib.request.url2pathname(result.path) - if not path: raise _error("path missing") query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) @@ -182,8 +175,7 @@ def _env_error(var): # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using env:// method") -if sys.platform != 'win32': - register_rendezvous_handler("tcp", _tcp_rendezvous_handler) - register_rendezvous_handler("env", _env_rendezvous_handler) register_rendezvous_handler("file", _file_rendezvous_handler) +register_rendezvous_handler("tcp", _tcp_rendezvous_handler) +register_rendezvous_handler("env", _env_rendezvous_handler) diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index 4b206f380111..68fe49f411f5 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -45,16 +45,15 @@ endfunction() set(C10D_SRCS FileStore.cpp + HashStore.cpp ProcessGroup.cpp + ProcessGroupRoundRobin.cpp Store.cpp PrefixStore.cpp + TCPStore.cpp Utils.cpp ) -if(NOT WIN32) - list(APPEND C10D_SRCS HashStore.cpp ProcessGroupRoundRobin.cpp TCPStore.cpp) -endif() - set(C10D_LIBS torch) if(USE_C10D_NCCL) @@ -78,17 +77,14 @@ endif() add_library(c10d STATIC ${C10D_SRCS}) set_property(TARGET c10d PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET c10d PROPERTY CXX_STANDARD 14) - -if(NOT MSVC) - target_compile_options(c10d PUBLIC - -Wall - -Wextra - -Wno-unused-parameter - -Wno-missing-field-initializers - -Wno-write-strings - -Wno-unknown-pragmas - ) -endif() +target_compile_options(c10d PUBLIC + -Wall + -Wextra + -Wno-unused-parameter + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + ) add_dependencies(c10d torch) @@ -122,19 +118,17 @@ if(USE_C10D_GLOO) endif() copy_header(FileStore.hpp) +copy_header(HashStore.hpp) copy_header(PrefixStore.hpp) copy_header(ProcessGroup.hpp) copy_header(Store.hpp) +copy_header(TCPStore.hpp) copy_header(Types.hpp) copy_header(Utils.hpp) if(USE_GLOO) copy_header(ProcessGroupGloo.hpp) copy_header(GlooDeviceFactory.hpp) endif() -if(NOT WIN32) - copy_header(HashStore.hpp) - copy_header(TCPStore.hpp) -endif() if(USE_C10D_NCCL) copy_header(ProcessGroupNCCL.hpp) diff --git a/torch/lib/c10d/FileStore.cpp b/torch/lib/c10d/FileStore.cpp index eb25c52f787a..55346e0fa635 100644 --- a/torch/lib/c10d/FileStore.cpp +++ b/torch/lib/c10d/FileStore.cpp @@ -3,16 +3,9 @@ #include #include #include -#include - -#ifdef _WIN32 -#include -#include -#include -#else #include +#include #include -#endif #include #include @@ -28,40 +21,6 @@ throw std::system_error(errno, std::system_category(), ##__VA_ARGS__); \ } -#ifdef _WIN32 -#define LOCK_EX 0x00000001 -#define LOCK_SH 0x00000010 -#define LOCK_UN 0x00000100 - -int flock_(int fd, int op) { - HANDLE hdl = (HANDLE) _get_osfhandle(fd); - DWORD low = 1, high = 0; - OVERLAPPED offset = {0, 0, 0, 0, NULL}; - - if (hdl < 0) - return -1; - - switch (op) { - case LOCK_EX: - if (LockFileEx(hdl, LOCKFILE_EXCLUSIVE_LOCK, 0, low, high, &offset)) - return 0; - break; - case LOCK_SH: - if (LockFileEx(hdl, 0, 0, low, high, &offset)) - return 0; - break; - case LOCK_UN: - 
if(UnlockFileEx(hdl, 0, low, high, &offset) != 0) - return 0; - break; - default: - break; - } - errno = EINVAL; - return -1; -} -#endif - namespace c10d { namespace { @@ -120,11 +79,7 @@ class Lock { int fd_{-1}; void flock(int operation) { -#ifdef _WIN32 - auto rv = syscall(std::bind(::flock_, fd_, operation)); -#else auto rv = syscall(std::bind(::flock, fd_, operation)); -#endif SYSASSERT(rv, "flock"); } }; @@ -137,11 +92,7 @@ class File { std::chrono::milliseconds timeout) { const auto start = std::chrono::steady_clock::now(); while (true) { -#ifdef _WIN32 - fd_ = syscall(std::bind(::open, path.c_str(), flags | _O_BINARY, _S_IREAD | _S_IWRITE)); -#else fd_ = syscall(std::bind(::open, path.c_str(), flags, 0644)); -#endif // Only retry when the file doesn't exist, since we are waiting for the // file to be created in this case to address the following issue: // https://github.com/pytorch/pytorch/issues/13750 diff --git a/torch/lib/c10d/GlooDeviceFactory.cpp b/torch/lib/c10d/GlooDeviceFactory.cpp index dca6b03eb9dd..70c3c2bb7a31 100644 --- a/torch/lib/c10d/GlooDeviceFactory.cpp +++ b/torch/lib/c10d/GlooDeviceFactory.cpp @@ -36,16 +36,16 @@ C10_DEFINE_SHARED_REGISTRY_WITHOUT_WARNING( #if GLOO_HAVE_TRANSPORT_TCP static std::shared_ptr<::gloo::transport::Device> makeTCPDevice( - const std::string& interfaceName, + const std::string& interface, const std::string& hostname) { TORCH_CHECK( - !interfaceName.empty() || !hostname.empty(), + !interface.empty() || !hostname.empty(), "GlooDeviceFactory::makeTCPDevice(): interface or hostname " "can't be empty"); ::gloo::transport::tcp::attr attr; - if (!interfaceName.empty()) { - attr.iface = interfaceName; + if (!interface.empty()) { + attr.iface = interface; } else { attr.hostname = hostname; } @@ -61,16 +61,16 @@ C10_REGISTER_CREATOR(GlooDeviceRegistry, TCP, makeTCPDevice); #if GLOO_HAVE_TRANSPORT_UV static std::shared_ptr<::gloo::transport::Device> makeUVDevice( - const std::string& interfaceName, + const std::string& interface, const std::string& hostname) { TORCH_CHECK( - !interfaceName.empty() || !hostname.empty(), + !interface.empty() || !hostname.empty(), "GlooDeviceFactory::makeUVDevice(): interface or hostname " "can't be empty"); ::gloo::transport::uv::attr attr; - if (!interfaceName.empty()) { - attr.iface = interfaceName; + if (!interface.empty()) { + attr.iface = interface; } else { attr.hostname = hostname; } @@ -81,28 +81,23 @@ static std::shared_ptr<::gloo::transport::Device> makeUVDevice( // the flexibility of other application to override by priority. Register // UV to `UV` for env "GLOO_DEVICE_TRANSPORT" override. 
C10_REGISTER_CREATOR(GlooDeviceRegistry, APPLE, makeUVDevice); -C10_REGISTER_CREATOR(GlooDeviceRegistry, WIN32, makeUVDevice); C10_REGISTER_CREATOR(GlooDeviceRegistry, UV, makeUVDevice); #endif static const char* glooDeviceTransport = getenv("GLOO_DEVICE_TRANSPORT"); std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory:: - makeDeviceForInterface(const std::string& interfaceName) { + makeDeviceForInterface(const std::string& interface) { if (glooDeviceTransport) { - return GlooDeviceRegistry()->Create(glooDeviceTransport, interfaceName, ""); + return GlooDeviceRegistry()->Create(glooDeviceTransport, interface, ""); } #ifdef __linux__ - return GlooDeviceRegistry()->Create("LINUX", interfaceName, ""); + return GlooDeviceRegistry()->Create("LINUX", interface, ""); #endif #ifdef __APPLE__ - return GlooDeviceRegistry()->Create("APPLE", interfaceName, ""); -#endif - -#ifdef _WIN32 - return GlooDeviceRegistry()->Create("WIN32", interfaceName, ""); + return GlooDeviceRegistry()->Create("APPLE", interface, ""); #endif throw std::runtime_error("makeDeviceForInterface(): unsupported gloo device"); @@ -122,10 +117,6 @@ std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory:: return GlooDeviceRegistry()->Create("APPLE", "", hostname); #endif -#ifdef _WIN32 - return GlooDeviceRegistry()->Create("WIN32", "", hostname); -#endif - throw std::runtime_error("makeDeviceForHostname(): unsupported gloo device"); } diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index c139ac7a34fd..531fe751f1c9 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -2,16 +2,10 @@ #include -#ifdef _WIN32 -#include -#include -#include -#else #include #include -#include -#endif #include +#include #include @@ -42,36 +36,6 @@ #include #include -#ifdef _WIN32 -#define GENERATE_ALL_TYPES(type, func, ...) \ - switch (type) { \ - case ::at::ScalarType::Float: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Double: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Half: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Char: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Byte: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Int: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Long: \ - func(__VA_ARGS__); \ - break; \ - default: \ - throw std::runtime_error("Invalid scalar type"); \ - } - -#define HOST_NAME_MAX 256 -#else #define GENERATE_ALL_TYPES(type, func, args...) \ switch (type) { \ case ::at::ScalarType::Float: \ @@ -98,7 +62,6 @@ default: \ throw std::runtime_error("Invalid scalar type"); \ } -#endif namespace c10d { @@ -446,19 +409,12 @@ ProcessGroupGloo::Options::Options() namespace { -void socketInitialize() { -#ifdef _WIN32 - ::gloo::init_winsock(); -#endif -} - // Gloo assumes that this machine's hostname can always be resolved // to an address. If it doesn't it throws a runtime error saying // that it can't be resolved. Instead of catching it, we choose // to proactively check if an address can be resolved, so we can // gracefully fall back to an alternative if it doesn't. 
bool doesHostnameResolveToUsableAddress(const std::string& hostname) { - socketInitialize(); struct addrinfo hints; memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; @@ -475,11 +431,7 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) { continue; } rv = bind(fd, rp->ai_addr, rp->ai_addrlen); -#ifdef _WIN32 - closesocket(fd); -#else close(fd); -#endif if (rv == -1) { continue; } @@ -491,11 +443,14 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) { } // namespace +#if defined(__linux__) || defined(__APPLE__) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: - createDeviceForInterface(const std::string& interface_name) { - return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface_name); + createDeviceForInterface(const std::string& interface) { + return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface); } +#endif +#if defined(__linux__) || defined(__APPLE__) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: createDeviceForHostname(const std::string& hostname) { TORCH_CHECK( @@ -505,14 +460,14 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: " to a (local) address"); return ::c10d::GlooDeviceFactory::makeDeviceForHostname(hostname); } +#endif -#if defined(__linux__) || defined(_WIN32) +#ifdef __linux__ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: createDefaultDevice() { // Use the hostname to resolve the network address to // use. Note: if the hostname does not resolve to an address (e.g. // because of misconfigured /etc/hosts file), this will not work. - socketInitialize(); std::array hostname{}; auto rv = gethostname(hostname.data(), HOST_NAME_MAX); if (rv != 0) { diff --git a/torch/lib/c10d/Utils.cpp b/torch/lib/c10d/Utils.cpp index 6c6e941ef95d..d975f6eb6bc5 100644 --- a/torch/lib/c10d/Utils.cpp +++ b/torch/lib/c10d/Utils.cpp @@ -1,6 +1,5 @@ #include -#ifndef _WIN32 #include #include @@ -355,6 +354,6 @@ std::tuple accept( return std::make_tuple( socket, sockaddrToString(reinterpret_cast(&addr))); } + } // namespace tcputil } // namespace c10d -#endif diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 1116cd39ba1c..1bdaddde9f24 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -1,8 +1,6 @@ #pragma once -#ifndef _WIN32 #include -#endif #include #include @@ -482,7 +480,6 @@ class ResourceGuard { bool released_; }; -#ifndef _WIN32 namespace tcputil { constexpr std::chrono::milliseconds kNoTimeout = std::chrono::milliseconds(-1); @@ -612,5 +609,4 @@ std::tuple accept( const std::chrono::milliseconds& timeout = kNoTimeout); } // namespace tcputil -#endif } // namespace c10d diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index 003f56f30861..8429d1099b29 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -8,19 +8,14 @@ function(c10d_add_test test_src) get_filename_component(test_name ${test_src} NAME_WE) add_executable(${test_name} "${test_src}") target_include_directories(${test_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..) 
- target_link_libraries(${test_name} ${ARGN}) - if(NOT WIN32) - target_link_libraries(${test_name} pthread) - target_compile_options(${test_name} PRIVATE -Wno-error) - endif() + target_link_libraries(${test_name} pthread ${ARGN}) + target_compile_options(${test_name} PRIVATE -Wno-error) add_test(NAME ${test_name} COMMAND $) endfunction() c10d_add_test(FileStoreTest.cpp c10d gtest_main) -if(NOT WIN32) - c10d_add_test(HashStoreTest.cpp c10d gtest_main) - c10d_add_test(TCPStoreTest.cpp c10d gtest_main) -endif() +c10d_add_test(HashStoreTest.cpp c10d gtest_main) +c10d_add_test(TCPStoreTest.cpp c10d gtest_main) if(USE_CUDA) if(USE_C10D_GLOO) @@ -34,7 +29,7 @@ if(USE_CUDA) endif() else() if(USE_C10D_GLOO) - c10d_add_test(ProcessGroupGlooTest.cpp c10d gtest_main) + c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d gtest_main) endif() endif() diff --git a/torch/lib/c10d/test/CUDATest.hpp b/torch/lib/c10d/test/CUDATest.hpp index 328da2faf648..defaff895a18 100644 --- a/torch/lib/c10d/test/CUDATest.hpp +++ b/torch/lib/c10d/test/CUDATest.hpp @@ -5,15 +5,9 @@ namespace c10d { namespace test { -#ifdef _WIN32 -#define EXPORT_TEST_API __declspec(dllexport) -#else -#define EXPORT_TEST_API -#endif +void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks); -EXPORT_TEST_API void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks); - -EXPORT_TEST_API int cudaNumDevices(); +int cudaNumDevices(); } // namespace test } // namespace c10d diff --git a/torch/lib/c10d/test/FileStoreTest.cpp b/torch/lib/c10d/test/FileStoreTest.cpp index cc8da6326091..77215f4521c2 100644 --- a/torch/lib/c10d/test/FileStoreTest.cpp +++ b/torch/lib/c10d/test/FileStoreTest.cpp @@ -1,8 +1,6 @@ #include -#ifndef _WIN32 #include -#endif #include #include @@ -12,11 +10,6 @@ #include #include -#ifdef _WIN32 -std::string tmppath() { - return c10d::test::autoGenerateTmpFilePath(); -} -#else std::string tmppath() { const char* tmpdir = getenv("TMPDIR"); if (tmpdir == nullptr) { @@ -36,7 +29,6 @@ std::string tmppath() { close(fd); return std::string(tmp.data(), tmp.size()); } -#endif void testGetSet(std::string path, std::string prefix = "") { // Basic Set/Get on File Store diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index da4f9b5fc106..6606e553e733 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -1,10 +1,7 @@ -#ifndef _WIN32 #include +#include #include #include -#endif - -#include #include #include @@ -24,7 +21,6 @@ using namespace c10d::test; constexpr auto kSendDelay = std::chrono::milliseconds(100); constexpr auto kWaitTimeout = std::chrono::milliseconds(1); -#ifndef _WIN32 class SignalTest { public: SignalTest(const std::string& path) : path_(path) {} @@ -96,7 +92,6 @@ std::shared_ptr<::c10d::ProcessGroup::Work> testSignal( test.arm(fork.pid, signal); return test.run(0, 2); } -#endif class ProcessGroupGlooDelayed : public ::c10d::ProcessGroupGloo { public: @@ -461,7 +456,6 @@ void testRecv(const std::string& path) { EXPECT_TRUE(recvCompleted); } -#ifndef _WIN32 TEST(ProcessGroupGlooTest, testSIGSTOPException) { // test SIGSTOP // Fork() and TSAN don't play well together, so skip the test if we're testing @@ -491,7 +485,6 @@ TEST(ProcessGroupGlooTest, testSIGKILLException) { EXPECT_FALSE(work->isSuccess()); EXPECT_THROW(std::rethrow_exception(work->exception()), std::exception); } -#endif TEST(ProcessGroupGlooTest, testAllReduceCPU) { { diff --git a/torch/lib/c10d/test/TestUtils.hpp 
b/torch/lib/c10d/test/TestUtils.hpp index 5f5dfca315cb..c62695485573 100644 --- a/torch/lib/c10d/test/TestUtils.hpp +++ b/torch/lib/c10d/test/TestUtils.hpp @@ -1,12 +1,9 @@ #pragma once -#ifndef _WIN32 #include +#include #include #include -#endif - -#include #include #include @@ -40,28 +37,6 @@ class Semaphore { std::condition_variable cv_; }; -#ifdef _WIN32 -std::string autoGenerateTmpFilePath() { - char tmp[L_tmpnam_s]; - errno_t err; - err = tmpnam_s(tmp, L_tmpnam_s); - if (err != 0) - { - throw std::system_error(errno, std::system_category()); - } - return std::string(tmp); -} - -std::string tmppath() { - const char* tmpfile = getenv("TMPFILE"); - if (tmpfile) { - return std::string(tmpfile); - } - else { - return autoGenerateTmpFilePath(); - } -} -#else std::string tmppath() { // TMPFILE is for manual test execution during which the user will specify // the full temp file path using the environmental variable TMPFILE @@ -88,7 +63,6 @@ std::string tmppath() { close(fd); return std::string(tmp.data(), tmp.size()); } -#endif bool isTSANEnabled() { auto s = std::getenv("PYTORCH_TEST_WITH_TSAN"); @@ -106,7 +80,6 @@ struct TemporaryFile { } }; -#ifndef _WIN32 struct Fork { pid_t pid; @@ -128,7 +101,6 @@ struct Fork { return pid == 0; } }; -#endif } // namespace test } // namespace c10d diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index b2cd30c66812..f8e5b4822bd8 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -16,7 +16,7 @@ import torch.distributed as c10d from functools import partial, reduce -from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, FILE_SCHEMA +from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM class TestSkip(NamedTuple): exit_code: int @@ -143,23 +143,10 @@ def wrapper(*args, **kwargs): return wrapper -def skip_if_win32(): - return unittest.skipIf( - sys.platform == 'win32', - "This unit test case is not supportted on Windows platform", - ) - TIMEOUT_DEFAULT = 100 TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400} -def create_device(interface=None): - if sys.platform == 'win32' or interface is None: - return c10d.ProcessGroupGloo.create_device(hostname="127.0.0.1") - else: - return c10d.ProcessGroupGloo.create_device(interface=interface) - - def get_timeout(test_id): return TIMEOUT_OVERRIDE.get(test_id.split('.')[-1], TIMEOUT_DEFAULT) @@ -219,7 +206,7 @@ def initialize_temp_directories(init_method=None): if init_method is not None: os.environ["INIT_METHOD"] = init_method else: - os.environ["INIT_METHOD"] = FILE_SCHEMA + os.path.join( + os.environ["INIT_METHOD"] = "file://" + os.path.join( init_dir_path, "shared_init_file" ) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 36434ff8aa2f..9959551031ff 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -53,10 +53,6 @@ torch.backends.disable_global_flags() -FILE_SCHEMA = "file://" -if sys.platform == 'win32': - FILE_SCHEMA = "file:///" - IS_SANDCASTLE = os.getenv('SANDCASTLE') == '1' or os.getenv('TW_JOB_USER') == 'sandcastle' class ProfilingMode(Enum): diff --git a/torch/testing/_internal/dist_utils.py b/torch/testing/_internal/dist_utils.py index 93de304a53ca..b88765211df1 100644 --- a/torch/testing/_internal/dist_utils.py +++ b/torch/testing/_internal/dist_utils.py @@ -7,7 +7,6 @@ import torch.distributed as dist import torch.distributed.rpc as rpc from 
torch.distributed.rpc import _rref_context_get_debug_info # type: ignore[attr-defined] -from torch.testing._internal.common_utils import FILE_SCHEMA if not dist.is_available(): @@ -15,7 +14,7 @@ sys.exit(0) -INIT_METHOD_TEMPLATE = FILE_SCHEMA + "{file_name}" +INIT_METHOD_TEMPLATE = "file://{file_name}" def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True, diff --git a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py index 09db831e9999..1b1f755ed4cc 100644 --- a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py +++ b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py @@ -20,7 +20,7 @@ skip_if_lt_x_gpu, skip_if_rocm, ) -from torch.testing._internal.dist_utils import dist_init, INIT_METHOD_TEMPLATE +from torch.testing._internal.dist_utils import dist_init from torch.testing._internal.distributed.rpc.rpc_agent_test_fixture import ( RpcAgentTestFixture, ) @@ -329,7 +329,7 @@ def _remote_worker_process(self): gLogger.info("The remote worker is running.") dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -346,7 +346,7 @@ def _trainer_process(self, rank: int): ) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -363,7 +363,7 @@ def _master_process(self, ddp_mode: DdpMode, simulate_uneven_inputs: bool): gLogger.info("Running the master process...") dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -500,7 +500,7 @@ def _run_test_ddp_comparision(self, simulate_uneven_inputs=False): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -567,7 +567,7 @@ def test_ddp_dist_autograd_sparse_grads(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -604,7 +604,7 @@ def test_ddp_dist_autograd_local_vs_remote(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -651,7 +651,7 @@ def test_ddp_dist_autograd_local_vs_remote_gpu(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index af5e648f6acb..f6f2b9a6fbfb 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -1,4 +1,5 @@ import copy +import fcntl import itertools import random import math @@ -21,7 
+22,6 @@ import torch.nn as nn import torch.nn.functional as F from torch.distributed.distributed_c10d import _get_default_group, AllreduceOptions, GroupMember -from torch.testing._internal.common_utils import FILE_SCHEMA from torch.testing._internal.common_distributed import ( MultiProcessTestCase, TEST_SKIPS, @@ -43,10 +43,6 @@ except ImportError: HAS_TORCHVISION = False -if sys.platform == 'win32': - import msvcrt -else: - import fcntl class Foo: def __init__(self, x): @@ -195,17 +191,10 @@ def _lock(): lockfile = os.path.join(TEMP_DIR, "lockfile") with open(lockfile, "w") as lf: try: - if sys.platform == 'win32': - msvcrt.locking(lf.fileno(), msvcrt.LK_RLCK, 1) - yield - else: - fcntl.flock(lf.fileno(), fcntl.LOCK_EX) - yield + fcntl.flock(lf.fileno(), fcntl.LOCK_EX) + yield finally: - if sys.platform == 'win32': - msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1) - else: - fcntl.flock(lf.fileno(), fcntl.LOCK_UN) + fcntl.flock(lf.fileno(), fcntl.LOCK_UN) lf.close() @@ -281,7 +270,7 @@ def tearDown(self): @property def init_method(self): - return "{}{file_name}".format(FILE_SCHEMA, file_name=self.file_name) + return "file://{file_name}".format(file_name=self.file_name) @classmethod def _run(cls, rank, test_name, file_name): @@ -2173,13 +2162,8 @@ def _test_DDP_5iter( # save the model in the middle and reload if test_save and idx == 2 and INIT_METHOD.startswith("file://"): with tempfile.NamedTemporaryFile() as tmp: - if sys.platform == 'win32': - torch.save(model_DDP, tmp) - tmp.seek(0) - model_DDP = torch.load(tmp) - else: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) with tempfile.TemporaryFile() as tmp_file: torch.save(model_DDP, tmp_file) @@ -2208,13 +2192,8 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None, gr # test serializable/unserializable with tempfile.NamedTemporaryFile() as tmp: - if sys.platform == 'win32': - torch.save(model_DDP, tmp) - tmp.seek(0) - model_DDP = torch.load(tmp) - else: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) # dummy data initialization local_bs = len(gpu_subset) @@ -2371,13 +2350,8 @@ def _test_DistributedDataParallel_SyncBatchNorm(self, gpu_subset, rank, local_bs # test serializable/unserializable with tempfile.NamedTemporaryFile() as tmp: - if sys.platform == 'win32': - torch.save(model_DDP, tmp) - tmp.seek(0) - model_DDP = torch.load(tmp) - else: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) # data initialization input_cpu = torch.randn(global_bs, 2) From bdf329ef8a256f2157aae86a5be28109c2589eb4 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 24 Sep 2020 22:49:17 -0700 Subject: [PATCH 125/449] SyncBN: preserve qconfig if it exists (#45317) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45317 Eager mode quantization depends on the presence of the `config` model attribute. Currently converting a model to use `SyncBatchNorm` removes the qconfig - fixing this. This is important if a BN is not fused to anything during quantization convert. 
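For reference, a minimal sketch of the behavior this change guarantees (it mirrors the new `test_syncbn_preserves_qconfig` unit test added in this patch; not part of the patch itself):

```python
import torch
import torch.nn as nn

# An unfused BatchNorm with an eager-mode qconfig attached directly to it.
m = nn.Sequential(
    nn.Conv2d(1, 1, 1),
    nn.BatchNorm2d(1),
)
m[1].qconfig = torch.quantization.default_qconfig

# Converting to SyncBatchNorm now copies the qconfig over instead of dropping it.
m = nn.SyncBatchNorm.convert_sync_batchnorm(m)
assert hasattr(m[1], "qconfig")
```
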
Test Plan: ``` python test/test_quantization.py TestDistributed.test_syncbn_preserves_qconfig ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D23922072 fbshipit-source-id: cc1bc25c8e5243abb924c6889f78cf65a81be158 --- test/quantization/test_workflow_module.py | 15 +++++++++++++++ torch/nn/modules/batchnorm.py | 14 ++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index 817e54460e07..5068a6fe7fd4 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -1536,6 +1536,21 @@ def forward(self, x): isinstance(fused_model.conv.bn, nn.SyncBatchNorm), "Expected BN to be converted to SyncBN") + def test_syncbn_preserves_qconfig(self): + """ + Makes sure that if a BatchNorm is not fused and a qconfig exists, + convering the module to SyncBatchNorm preserves the qconfig. + """ + m = nn.Sequential( + nn.Conv2d(1, 1, 1), + nn.BatchNorm2d(1), + ) + m[1].qconfig = torch.quantization.default_qconfig + m = torch.nn.SyncBatchNorm.convert_sync_batchnorm(m) + self.assertTrue( + hasattr(m[1], "qconfig"), + "missing qconfig after SyncBatchNorm conversion") + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @override_qengines diff --git a/torch/nn/modules/batchnorm.py b/torch/nn/modules/batchnorm.py index 075311870439..f5ca6deb5b19 100644 --- a/torch/nn/modules/batchnorm.py +++ b/torch/nn/modules/batchnorm.py @@ -114,7 +114,7 @@ def forward(self, input: Tensor) -> Tensor: else: # use exponential moving average exponential_average_factor = self.momentum - r""" + r""" Decide whether the mini-batch stats should be used for normalization rather than the buffers. Mini-batch stats are used in training mode, and in eval mode when buffers are None. """ @@ -185,7 +185,7 @@ class BatchNorm1d(_BatchNorm): track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics, and initializes statistics - buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. When these buffers are ``None``, this module always uses batch statistics. in both training and eval modes. Default: ``True`` @@ -258,7 +258,7 @@ class BatchNorm2d(_BatchNorm): track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics, and initializes statistics - buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. When these buffers are ``None``, this module always uses batch statistics. in both training and eval modes. Default: ``True`` @@ -332,7 +332,7 @@ class BatchNorm3d(_BatchNorm): track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics, and initializes statistics - buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. When these buffers are ``None``, this module always uses batch statistics. in both training and eval modes. 
Default: ``True`` @@ -414,7 +414,7 @@ class SyncBatchNorm(_BatchNorm): track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics, and initializes statistics - buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. When these buffers are ``None``, this module always uses batch statistics. in both training and eval modes. Default: ``True`` process_group: synchronization of stats happen within each process group @@ -493,7 +493,7 @@ def forward(self, input: Tensor) -> Tensor: else: # use exponential moving average exponential_average_factor = self.momentum - r""" + r""" Decide whether the mini-batch stats should be used for normalization rather than the buffers. Mini-batch stats are used in training mode, and in eval mode when buffers are None. """ @@ -576,6 +576,8 @@ def convert_sync_batchnorm(cls, module, process_group=None): module_output.running_mean = module.running_mean module_output.running_var = module.running_var module_output.num_batches_tracked = module.num_batches_tracked + if hasattr(module, "qconfig"): + module_output.qconfig = module.qconfig for name, child in module.named_children(): module_output.add_module(name, cls.convert_sync_batchnorm(child, process_group)) del module From 95df8657c94492ff026112f8e51a24216f1a9a0c Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Thu, 24 Sep 2020 23:07:38 -0700 Subject: [PATCH 126/449] Enables test linalg (#45278) Summary: Fixes https://github.com/pytorch/pytorch/issues/45271. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45278 Reviewed By: ngimel Differential Revision: D23926124 Pulled By: mruberry fbshipit-source-id: 26692597f9a1988e5fa846f97b8430c3689cac27 --- test/run_test.py | 1 + test/test_linalg.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/test/run_test.py b/test/run_test.py index d63fc372f9c2..b24a20c60f46 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -41,6 +41,7 @@ 'test_foreach', 'test_indexing', 'test_jit', + 'test_linalg', 'test_logging', 'test_mkldnn', 'test_multiprocessing', diff --git a/test/test_linalg.py b/test/test_linalg.py index c81b4dc37582..3dbf31497b77 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -5,7 +5,7 @@ from math import inf, nan, isnan from torch.testing._internal.common_utils import \ - (TestCase, run_tests, TEST_NUMPY) + (TestCase, run_tests, TEST_NUMPY, IS_MACOS, IS_WINDOWS, TEST_WITH_ASAN) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, skipCUDAIfNoMagma, skipCPUIfNoLapack) from torch.testing._internal.jit_metaprogramming_utils import gen_script_fn_and_args @@ -56,11 +56,12 @@ def test_det(self, device, dtype): # NOTE: det requires a 2D+ tensor t = torch.randn(1, device=device, dtype=dtype) - with self.assertRaises(IndexError): + with self.assertRaises(RuntimeError): op(t) # This test confirms that torch.linalg.norm's dtype argument works # as expected, according to the function's documentation + @skipCUDAIfNoMagma def test_norm_dtype(self, device): def run_test_case(input_size, ord, keepdim, from_dtype, to_dtype, compare_dtype): msg = ( @@ -154,6 +155,7 @@ def run_test_case(input, p, dim, keepdim): # This test compares torch.linalg.norm and numpy.linalg.norm to ensure that # their matrix norm results match + @skipCUDAIfNoMagma @unittest.skipIf(not TEST_NUMPY, "NumPy not 
found") @dtypes(torch.float, torch.double) def test_norm_matrix(self, device, dtype): @@ -400,6 +402,8 @@ def gen_error_message(input_size, ord, keepdim, dim=None): # Test that linal.norm gives the same result as numpy when inputs # contain extreme values (inf, -inf, nan) + @unittest.skipIf(IS_WINDOWS, "Skipped on Windows!") + @unittest.skipIf(IS_MACOS, "Skipped on MacOS!") @skipCUDAIfNoMagma @skipCPUIfNoLapack @unittest.skipIf(not TEST_NUMPY, "Numpy not found") @@ -440,14 +444,14 @@ def is_broken_matrix_norm_case(ord, x): result_n = np.linalg.norm(x_n, ord=ord) if is_broken_matrix_norm_case(ord, x): - self.assertNotEqual(result, result_n, msg=msg) + continue else: self.assertEqual(result, result_n, msg=msg) # Test degenerate shape results match numpy for linalg.norm vector norms @skipCUDAIfNoMagma @skipCPUIfNoLapack - @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + @unittest.skipIf(TEST_WITH_ASAN, "Skipped on ASAN since it checks for undefined behavior.") @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) def test_norm_vector_degenerate_shapes(self, device, dtype): def run_test_case(input, ord, dim, keepdim, should_error): From 99e0a87bbb4faa6bb539c0eedf323d79fdd8cfcf Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Thu, 24 Sep 2020 23:11:38 -0700 Subject: [PATCH 127/449] [nvFuser] Latency improvements for pointwise + reduction fusion (#45218) Summary: A lot of changes are in this update, some highlights: - Added Doxygen config file - Split the fusion IR (higher level TE like IR) from kernel IR (lower level CUDA like IR) - Improved latency with dynamic shape handling for the fusion logic - Prevent recompilation for pointwise + reduction fusions when not needed - Improvements to inner dimension reduction performance - Added input -> kernel + kernel launch parameters cache, added eviction policy - Added reduction fusions with multiple outputs (still single reduction stage) - Fixed code generation bugs for symbolic tiled GEMM example - Added thread predicates to prevent shared memory form being loaded multiple times - Improved sync threads placements with shared memory and removed read before write race - Fixes to FP16 reduction fusions where output would come back as FP32 Pull Request resolved: https://github.com/pytorch/pytorch/pull/45218 Reviewed By: ezyang Differential Revision: D23905183 Pulled By: soumith fbshipit-source-id: 12f5ad4cbe03e9a25043bccb89e372f8579e2a79 --- aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h | 1 + caffe2/CMakeLists.txt | 4 + test/cpp/jit/test_gpu.cpp | 1925 ++++++++++--- test/cpp/jit/tests.h | 21 +- test/test_jit_cuda_fuser.py | 99 +- test/test_jit_cuda_fuser_legacy.py | 6 + test/test_jit_cuda_fuser_profiling.py | 6 + tools/build_variables.bzl | 4 + torch/csrc/jit/codegen/cuda/codegen.cpp | 640 +++++ torch/csrc/jit/codegen/cuda/codegen.h | 22 + torch/csrc/jit/codegen/cuda/compute_at.cpp | 65 +- torch/csrc/jit/codegen/cuda/compute_at.h | 4 +- torch/csrc/jit/codegen/cuda/docs/.gitignore | 1 + .../jit/codegen/cuda/docs/documentation.h | 23 + .../csrc/jit/codegen/cuda/docs/fuser.doxygen | 2515 +++++++++++++++++ .../cuda/docs/images/ir_architecture.png | Bin 0 -> 96754 bytes torch/csrc/jit/codegen/cuda/docs/main_page.md | 8 + torch/csrc/jit/codegen/cuda/executor.cpp | 395 ++- torch/csrc/jit/codegen/cuda/executor.h | 86 +- .../jit/codegen/cuda/executor_kernel_arg.cpp | 2 +- .../jit/codegen/cuda/executor_kernel_arg.h | 8 + .../jit/codegen/cuda/executor_launch_params.h | 5 + .../csrc/jit/codegen/cuda/executor_utils.cpp | 217 +- 
torch/csrc/jit/codegen/cuda/executor_utils.h | 17 +- .../csrc/jit/codegen/cuda/expr_evaluator.cpp | 219 +- torch/csrc/jit/codegen/cuda/expr_evaluator.h | 86 +- torch/csrc/jit/codegen/cuda/fusion.cpp | 153 +- torch/csrc/jit/codegen/cuda/fusion.h | 56 +- torch/csrc/jit/codegen/cuda/graph_fuser.cpp | 7 + torch/csrc/jit/codegen/cuda/index_compute.cpp | 186 +- .../csrc/jit/codegen/cuda/instrumentation.cpp | 71 + torch/csrc/jit/codegen/cuda/instrumentation.h | 93 + torch/csrc/jit/codegen/cuda/interface.cpp | 1 + torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp | 22 +- torch/csrc/jit/codegen/cuda/ir_base_nodes.h | 1 + torch/csrc/jit/codegen/cuda/ir_cloner.cpp | 76 - torch/csrc/jit/codegen/cuda/ir_cloner.h | 23 - torch/csrc/jit/codegen/cuda/ir_graphviz.cpp | 50 - torch/csrc/jit/codegen/cuda/ir_graphviz.h | 7 - .../jit/codegen/cuda/ir_interface_nodes.h | 9 +- .../csrc/jit/codegen/cuda/ir_internal_nodes.h | 12 + torch/csrc/jit/codegen/cuda/ir_iostream.cpp | 840 +----- torch/csrc/jit/codegen/cuda/ir_iostream.h | 108 +- torch/csrc/jit/codegen/cuda/ir_nodes.cpp | 377 ++- torch/csrc/jit/codegen/cuda/ir_printer.h | 54 +- torch/csrc/jit/codegen/cuda/iter_visitor.cpp | 104 +- torch/csrc/jit/codegen/cuda/iter_visitor.h | 71 +- torch/csrc/jit/codegen/cuda/kernel.cpp | 147 +- torch/csrc/jit/codegen/cuda/kernel.h | 121 +- torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 384 ++- torch/csrc/jit/codegen/cuda/kernel_cache.h | 111 +- torch/csrc/jit/codegen/cuda/kernel_ir.cpp | 315 +-- torch/csrc/jit/codegen/cuda/kernel_ir.h | 215 +- .../jit/codegen/cuda/kernel_ir_builder.cpp | 104 + .../csrc/jit/codegen/cuda/kernel_ir_builder.h | 81 + .../codegen/cuda/kernel_resource_strings.h | 77 +- torch/csrc/jit/codegen/cuda/lower2device.cpp | 191 +- torch/csrc/jit/codegen/cuda/lower2device.h | 41 +- torch/csrc/jit/codegen/cuda/lower_index.cpp | 137 +- torch/csrc/jit/codegen/cuda/lower_index.h | 8 + .../jit/codegen/cuda/lower_insert_syncs.cpp | 227 ++ .../jit/codegen/cuda/lower_insert_syncs.h | 51 + torch/csrc/jit/codegen/cuda/lower_loops.cpp | 157 +- torch/csrc/jit/codegen/cuda/lower_loops.h | 69 +- .../codegen/cuda/lower_thread_predicate.cpp | 68 +- .../jit/codegen/cuda/lower_thread_predicate.h | 32 +- torch/csrc/jit/codegen/cuda/lower_unroll.cpp | 20 +- torch/csrc/jit/codegen/cuda/lower_unroll.h | 8 +- torch/csrc/jit/codegen/cuda/lower_utils.cpp | 52 +- .../jit/codegen/cuda/lower_validation.cpp | 5 +- torch/csrc/jit/codegen/cuda/manager.cpp | 62 +- torch/csrc/jit/codegen/cuda/parser.cpp | 42 +- torch/csrc/jit/codegen/cuda/partition.cpp | 5 + .../jit/codegen/cuda/predicate_compute.cpp | 76 +- .../csrc/jit/codegen/cuda/predicate_compute.h | 8 +- torch/csrc/jit/codegen/cuda/scheduler.cpp | 483 ++-- torch/csrc/jit/codegen/cuda/scheduler.h | 32 +- .../csrc/jit/codegen/cuda/shape_inference.cpp | 12 +- torch/csrc/jit/codegen/cuda/tensor_view.cpp | 21 +- torch/csrc/jit/codegen/cuda/transform_iter.h | 2 + .../jit/codegen/cuda/transform_replay.cpp | 7 + .../jit/codegen/cuda/transform_rfactor.cpp | 5 + torch/csrc/jit/codegen/cuda/type.h | 8 + torch/csrc/jit/codegen/cuda/utils.h | 15 + 84 files changed, 8911 insertions(+), 3188 deletions(-) create mode 100644 torch/csrc/jit/codegen/cuda/codegen.cpp create mode 100644 torch/csrc/jit/codegen/cuda/codegen.h create mode 100644 torch/csrc/jit/codegen/cuda/docs/.gitignore create mode 100644 torch/csrc/jit/codegen/cuda/docs/documentation.h create mode 100644 torch/csrc/jit/codegen/cuda/docs/fuser.doxygen create mode 100644 torch/csrc/jit/codegen/cuda/docs/images/ir_architecture.png create mode 
100644 torch/csrc/jit/codegen/cuda/docs/main_page.md create mode 100644 torch/csrc/jit/codegen/cuda/instrumentation.cpp create mode 100644 torch/csrc/jit/codegen/cuda/instrumentation.h create mode 100644 torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp create mode 100644 torch/csrc/jit/codegen/cuda/kernel_ir_builder.h create mode 100644 torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp create mode 100644 torch/csrc/jit/codegen/cuda/lower_insert_syncs.h diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index 4630465115c7..00e57ca63520 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -42,6 +42,7 @@ namespace at { namespace cuda { _(nvrtcGetProgramLog) \ _(nvrtcGetLoweredName) \ _(cuModuleLoadData) \ + _(cuModuleLoadDataEx) \ _(cuModuleGetFunction) \ _(cuOccupancyMaxActiveBlocksPerMultiprocessor) \ _(cuGetErrorString) \ diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 65f072b6f29d..6ea848bd32e5 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -506,6 +506,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/arith.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/compute_at.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/codegen.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/dispatch.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/expr_evaluator.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/executor.cpp @@ -515,6 +516,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/fusion.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/graph_fuser.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/index_compute.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/instrumentation.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_base_nodes.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_cloner.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_graphviz.cpp @@ -524,7 +526,9 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_cache.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_ir.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_ir_builder.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_index.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_insert_syncs.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_loops.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_thread_predicate.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_unroll.cpp diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index 80fa318d653a..d18becfa6641 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -11,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -73,11 +75,11 @@ TensorView* makeTensorWithContig( } void checkIntValue( - const EvaluationContext* eval_context, + StatefulExpressionEvaluator& evaluator, Val* val, Int::ScalarType expected_value) { TORCH_CHECK(val->isAnInt()); - const auto actual_value = ExpressionEvaluator::evaluate(val, eval_context); + const auto actual_value = evaluator.inferValue(val); TORCH_CHECK(actual_value.has_value()); TORCH_CHECK(actual_value.value() == expected_value); } @@ -162,16 +164,16 @@ void testGPU_FusionExprEvalConstants() { Fusion fusion; FusionGuard fg(&fusion); - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator 
evaluator(&fusion); auto* a = new Int(7); auto* b = new Int(3); - checkIntValue(&eval_context, neg(a), -7); - checkIntValue(&eval_context, add(a, b), 10); - checkIntValue(&eval_context, neg(mul(sub(a, b), div(a, b))), -8); - checkIntValue(&eval_context, mod(a, b), 1); - checkIntValue(&eval_context, ceilDiv(a, b), 3); + checkIntValue(evaluator, neg(a), -7); + checkIntValue(evaluator, add(a, b), 10); + checkIntValue(evaluator, neg(mul(sub(a, b), div(a, b))), -8); + checkIntValue(evaluator, mod(a, b), 1); + checkIntValue(evaluator, ceilDiv(a, b), 3); } // Evaluate basic scalar operations with bound values @@ -179,7 +181,7 @@ void testGPU_FusionExprEvalBindings() { Fusion fusion; FusionGuard fg(&fusion); - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); auto* a = new Int(); auto* b = new Int(); @@ -188,35 +190,35 @@ void testGPU_FusionExprEvalBindings() { auto* e = new Int(0); // trying to evaluate before binding should give empty results - TORCH_CHECK(!ExpressionEvaluator::evaluate(a, &eval_context).has_value()); - TORCH_CHECK(!ExpressionEvaluator::evaluate(d, &eval_context).has_value()); + TORCH_CHECK(!evaluator.inferValue(a).has_value()); + TORCH_CHECK(!evaluator.inferValue(d).has_value()); - eval_context.bind(a, 7); - eval_context.bind(b, 3); + evaluator.safeBind(a, 7); + evaluator.safeBind(b, 3); // can't bind to the results of expressions - ASSERT_ANY_THROW(eval_context.bind(c, 100)); + ASSERT_ANY_THROW(evaluator.safeBind(c, 100)); // can't bind to concrete values - ASSERT_ANY_THROW(eval_context.bind(e, 100)); + ASSERT_ANY_THROW(evaluator.safeBind(e, 100)); - checkIntValue(&eval_context, c, 10); - checkIntValue(&eval_context, sub(a, b), 4); - checkIntValue(&eval_context, mod(a, b), 1); - checkIntValue(&eval_context, ceilDiv(a, b), 3); - checkIntValue(&eval_context, d, -4); + checkIntValue(evaluator, c, 10); + checkIntValue(evaluator, sub(a, b), 4); + checkIntValue(evaluator, mod(a, b), 1); + checkIntValue(evaluator, ceilDiv(a, b), 3); + checkIntValue(evaluator, d, -4); // Reset evaluation context - eval_context = EvaluationContext(&fusion); + evaluator = StatefulExpressionEvaluator(&fusion); - eval_context.bind(a, 2); - eval_context.bind(b, 5); + evaluator.safeBind(a, 2); + evaluator.safeBind(b, 5); - checkIntValue(&eval_context, c, 7); - checkIntValue(&eval_context, sub(a, b), -3); - checkIntValue(&eval_context, mod(a, b), 2); - checkIntValue(&eval_context, ceilDiv(a, b), 1); - checkIntValue(&eval_context, d, -2); + checkIntValue(evaluator, c, 7); + checkIntValue(evaluator, sub(a, b), -3); + checkIntValue(evaluator, mod(a, b), 2); + checkIntValue(evaluator, ceilDiv(a, b), 1); + checkIntValue(evaluator, d, -2); } // Evaluate expressions in a simple IR @@ -247,8 +249,8 @@ void testGPU_FusionExprEvalBasic() { tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); - // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + // 1. Create an evaluator + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values // @@ -258,21 +260,21 @@ void testGPU_FusionExprEvalBasic() { // (ex. 
`tv0->getRootDomain()[0]->extent()` // instead of `tv0->axis(0)->extent()`) // - eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); - eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv1->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv1->getRootDomain()[1]->extent(), 128); // 3. Evaluate and check result values TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv2->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv2->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv2->axis(2)->rawExtent(), 128); TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv3->axis(2)->rawExtent(), 128); } // Evaluate expressions in a more complex IR @@ -298,33 +300,33 @@ void testGPU_FusionExprEvalComplex() { tv6->split(0, 5); tv5->merge(0); - // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + // 1. Create an evaluator + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values - eval_context.bind(tv0->getRootDomain()[0]->extent(), 129); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 127); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 129); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 127); // Evaluate and check extent values TORCH_CHECK(tv0->domain()->nDims() == 2); - checkIntValue(&eval_context, tv0->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv0->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv0->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv0->axis(1)->rawExtent(), 127); TORCH_CHECK(tv3->domain()->nDims() == 2); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 127); TORCH_CHECK(tv4->domain()->nDims() == 2); - checkIntValue(&eval_context, tv4->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv4->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv4->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv4->axis(1)->rawExtent(), 127); TORCH_CHECK(tv5->domain()->nDims() == 1); - checkIntValue(&eval_context, tv5->axis(0)->rawExtent(), 16383); + checkIntValue(evaluator, tv5->axis(0)->rawExtent(), 16383); TORCH_CHECK(tv6->domain()->nDims() == 3); - checkIntValue(&eval_context, tv6->axis(0)->rawExtent(), 26); - checkIntValue(&eval_context, tv6->axis(1)->rawExtent(), 5); - checkIntValue(&eval_context, tv6->axis(2)->rawExtent(), 127); + checkIntValue(evaluator, tv6->axis(0)->rawExtent(), 26); + checkIntValue(evaluator, tv6->axis(1)->rawExtent(), 5); + checkIntValue(evaluator, tv6->axis(2)->rawExtent(), 127); } // Evaluate expressions post lowering @@ -360,31 +362,29 @@ void 
testGPU_FusionExprEvalPostLower() { // Lower GpuLower gpulw(&fusion); - std::stringstream kernel; - gpulw.printKernel(kernel); // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values - eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); - eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv1->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv1->getRootDomain()[1]->extent(), 128); // 3. Evaluate and check result values TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv2->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv2->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv2->axis(2)->rawExtent(), 128); TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv3->axis(2)->rawExtent(), 128); - checkIntValue(&eval_context, bid_x, 2); - checkIntValue(&eval_context, tid_x, 128); + checkIntValue(evaluator, bid_x, 2); + checkIntValue(evaluator, tid_x, 128); } void testGPU_FusionClear() { @@ -505,10 +505,12 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_ir.str(), clone_ir.str()); // Lower original fusion - std::stringstream original_kernel; + std::string original_kernel; { - GpuLower lower(&original_fusion); - lower.printKernel(original_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&original_fusion); + original_kernel = + codegen::generateCudaKernel(GpuLower(&original_fusion).kernel()); } // Make sure the "before lowering" clone was not mutated @@ -529,12 +531,14 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str()); // Lower the "before lowering" and compare kernels - std::stringstream clone_kernel; + std::string clone_kernel; { - GpuLower lower(&before_lowering); - lower.printKernel(clone_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&before_lowering); + clone_kernel = + codegen::generateCudaKernel(GpuLower(&before_lowering).kernel()); } - ASSERT_EQ(original_kernel.str(), clone_kernel.str()); + ASSERT_EQ(original_kernel, clone_kernel); } void testGPU_FusionMove() { @@ -593,9 +597,7 @@ void testGPU_FusionMove() { ASSERT_EQ(original_ir.str(), another_ir.str()); // Lower the fusion IR - std::stringstream kernel; GpuLower lower(&another_fusion); - lower.printKernel(kernel); std::stringstream lowered_ir; lowered_ir << another_fusion; @@ -799,48 +801,6 @@ void testGPU_FusionTensor() { } } - { - auto tensor = at::randn({2, 1, 4}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == 
DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (int i = 0; i < static_cast(fuser_tensor->nDims()); i++) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]); - - // temporary WAR to disable contig & bcast; issue # 230 - // TODO: insert the check where broadcast & contiguous cannot be marked - // together - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[0]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - } - - { - auto tensor = at::randn({2, 3, 1}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (int i = 0; i < static_cast(fuser_tensor->nDims()); i++) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[0]); - - // temporary WAR to disable contig & bcast; issue # 230 - // TODO: insert the check where broadcast & contiguous cannot be marked - // together - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[2]); - } - // TensorType::create fills stride_properties, which helps us to mark // IterDomain properly // Note: implementation could change, depending on how much we want to invest @@ -1156,43 +1116,36 @@ void testGPU_FusionParser() { // 1. this can be moved to a dedicated "golden" file // 2. use a fuzzy compare (ignore non-significant whitespaces for example) const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3){ - float T2[4]; - if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - for(size_t i6 = 0; i6 < 4; ++i6 ) { - T2[ i6 ] - = T0[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ] - * T1[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ]; +__global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3) { + float T2[1]; + if ((((((blockIdx.x * 1) + (1 - 1)) * 128) + threadIdx.x) < T0.size[0])) { + for(size_t i6 = 0; i6 < 1; ++i6) { + T2[i6] + = T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + * T1[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; + T3[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + = T2[i6] + * T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } } else { - for(size_t i6 = 0; i6 < 4; ++i6 ) { - if ( ( ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - T2[ i6 ] - = T0[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ] - * T1[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ]; + for(size_t i6 = 0; i6 < 1; ++i6) { + if ((((((blockIdx.x * 1) + i6) * 128) + threadIdx.x) < T0.size[0])) { + T2[i6] + = T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + * T1[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } - } - } - if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - for(size_t i13 = 0; i13 < 4; ++i13 ) { - T3[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ] - = T2[ i13 ] - * T0[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ]; - } - } else { - for(size_t i13 = 0; i13 < 4; ++i13 ) { - if ( ( ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + 
threadIdx.x ) < T0.size[0] ) ) { - T3[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ] - = T2[ i13 ] - * T0[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ]; + if ((((((blockIdx.x * 1) + i6) * 128) + threadIdx.x) < T0.size[0])) { + T3[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + = T2[i6] + * T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } } } } )"; - std::string actual_kernel = GpuLower(fusion.get()).getKernel(); - actual_kernel = "\n" + actual_kernel; + const std::string actual_kernel = + "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); if (expected_kernel.size() != actual_kernel.size() || expected_kernel.compare(actual_kernel) != 0) { std::cerr @@ -1576,11 +1529,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t5), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t5)); TORCH_CHECK(at::allclose(outputs[1], t6)); } @@ -1636,11 +1585,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); fe.runFusion({t0, t1}, {kernel_tv3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(kernel_tv3, t3), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); } // Case 4 @@ -1706,11 +1651,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0, t1, t2, t3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t6), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t6)); } // Case 5 @@ -1752,176 +1693,715 @@ void testGPU_FusionAdvancedComputeAt() { } } -void testGPU_FusionScalarInputs() { +void testGPU_FusionComputeAtMultiConsumers() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -2 Fusion fusion; FusionGuard fg(&fusion); - TensorView* tv0 = makeDummyTensor(2); + TensorView* tv0 = makeDummyTensor(1); fusion.addInput(tv0); - TensorView* tv1 = makeDummyTensor(2); - fusion.addInput(tv1); - Float* f0 = new Float(); - fusion.addInput(f0); - Float* f1 = new Float(); - fusion.addInput(f1); - Float* f2 = new Float(); - fusion.addInput(f2); - Float* f3 = new Float(); - fusion.addInput(f3); - Val* f4 = mul(f0, f1); - Val* f5 = sub(f2, f3); + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + fusion.addOutput(tv2); + fusion.addOutput(tv3); - TensorView* tv2 = sub(tv1, f4); - TensorView* tv3 = add(tv0, f5); - TensorView* tv4 = mul(tv3, tv2); + // This computeAt will affect tv2 as well, even though tv2 is not in + // the data-flow path between tv1 and tv3. The reason is that tv1 is + // now computed at tv3, so tv2 must also be computed at the same + // location. Overall, what will happen is basically we merge + // expressions of all tensors and compute them in a single loop + // nest. + TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + // Note that tv2 is also computed at tv3. 
+ TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + TORCH_CHECK(!tv3->hasComputeAt()); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + + at::Tensor kernel_tv2 = at::empty_like(t0, options); + at::Tensor kernel_tv3 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv2, kernel_tv3}); + + TORCH_CHECK(at::allclose(kernel_tv2, t2)); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); +} +// Similar to ComputeAtMultiConsumers, but with a common consumer. +void testGPU_FusionComputeAtCommonConsumer1() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -2 + // tv4 = tv2 + tv3 + // tv5 = tv4 * 5 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + TensorView* tv4 = add(tv2, tv3); + TensorView* tv5 = mul(tv4, new Float(5.0)); + fusion.addOutput(tv3); fusion.addOutput(tv4); + fusion.addOutput(tv5); - // Lets setup to actually run - while (tv4->nDims() > 1) - tv4->merge(0); - tv4->split(0, 128); - tv4->split(0, 4); + // Computing tv1 at tv3. This will affect tv2 as discussed in + // ComplexComputeAt1. Additionally, in this case, notice that tv4 is + // the common consumer of tv2 and tv3, so they are computed at + // tv4. The indirect propagation of the computeAt should stop at the + // common consumer, and no further change should occur. More + // specifically, tv4 and tv5 should not have a computeAt tensor. 
+ TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } - tv0->computeAt(tv4, 1); - tv1->computeAt(tv4, 1); + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv4); + TORCH_CHECK(tv3->getComputeAtView() == tv4); + TORCH_CHECK(!tv4->hasComputeAt()); + TORCH_CHECK(!tv5->hasComputeAt()); - tv4->axis(0)->parallelize(ParallelType::BIDx); + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + auto t4 = t2 + t3; + auto t5 = t4 * 5.0; + + at::Tensor kernel_tv3 = at::empty_like(t0, options); + at::Tensor kernel_tv4 = at::empty_like(t0, options); + at::Tensor kernel_tv5 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv3, kernel_tv4, kernel_tv5}); + + TORCH_CHECK(at::allclose(kernel_tv3, t3)); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); +} + +void testGPU_FusionComputeAtCommonConsumer2() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -1 + // tv4 = tv1 + 4 + // tv5 = tv3 + tv4 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv2, new Float(-1.0)); + TensorView* tv4 = add(tv1, new Float(4.0)); + TensorView* tv5 = add(tv3, tv4); + + fusion.addOutput(tv5); + + TensorView* computeAtTarget = tv3; + + computeAtTarget->merge(0); + computeAtTarget->split(0, 128); + computeAtTarget->split(0, 4); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + // This computeAt will affect all tensors including tv3, tv4 and + // tv5, even though it appears to impact only tv1 and tv2. The + // reason is that tv1 is now computed at tv3, so tv4 must also be + // computed at the same location. Similarly, the consumer of tv4, + // tv5, must also be computed at the same location. Overall, what + // will happen is basically we merge expressions of all tensors and + // compute them in a single loop nest. Internally, this will be + // realized by making all tensors, except for those in the path + // between tv1 and tv3, computed at tv5, which we call the common + // consumer. 
+ tv1->computeAt(computeAtTarget, 1); + + // All tensors should have the same dimenionality as the target + for (Val* val : fusion.vals()) { + if (fusion.hasInput(val) || + val->getValType().value() != ValType::TensorView) { + continue; + } + TensorView* tv = val->as(); + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == tv2); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + // tv3 and tv4 are computed at tv5 + TORCH_CHECK(tv3->getComputeAtView() == tv5); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + TORCH_CHECK(!tv5->hasComputeAt()); for (Val* val : fusion.vals()) { if (!fusion.hasInput(val) && val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - + TensorView* tv = val->as(); tv->axis(1)->parallelize(ParallelType::Unroll); tv->axis(-1)->parallelize(ParallelType::TIDx); } } - // f4 = f0 * f1 - // f5 = f2 - f3 - // t2 = t1 - f4 - // t3 = t0 + f5 - // t4 = t3 * t2 - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - float fl0 = 0.1; - float fl1 = -0.2; - float fl2 = 0.3; - float fl3 = -0.4; - float fl4 = fl0 * fl1; - float fl5 = fl2 - fl3; - at::Tensor t0 = at::randn({129, 127}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t1.sub(fl4); - auto t3 = t0.add(fl5); - auto t4 = t3.mul(t2); - at::Tensor kernel_tv4 = at::empty_like(t0, options); + auto t1 = t0.mul({0.5}); + auto t2 = t1.mul({-1.0}); + auto t3 = t2.mul({-1.0}); + auto t4 = t1.add({4.0}); + auto t5 = t3 + t4; - at::Scalar test(fl0); + at::Tensor kernel_tv5 = at::empty_like(t0, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - fe.runFusion( - {t0, - t1, - at::Scalar(fl0), - at::Scalar(fl1), - at::Scalar(fl2), - at::Scalar(fl3)}, - {kernel_tv4}); - - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); + fe.runFusion({t0}, {kernel_tv5}); - TORCH_CHECK(at::allclose(kernel_tv4, t4), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); } -void testGPU_FusionLoopUnroll() { +// Similar to the above common consumer test but adds an additional +// tensor that has no common consumer with the other tensors. 
+void testGPU_FusionComputeAtCommonConsumer3() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -1 + // tv4 = tv1 + 4 + // tv5 = tv2 + tv3 + // tv6 = tv1 + 6 Fusion fusion; FusionGuard fg(&fusion); - // Set up your input tensor views - TensorView* tv0 = makeDummyTensor(3); - TensorView* tv1 = makeDummyTensor(3); - - // Register your inputs + TensorView* tv0 = makeDummyTensor(2); fusion.addInput(tv0); - fusion.addInput(tv1); - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Float(2.0)); - TensorView* tv3 = add(tv0, tv2); + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv2, new Float(-1.0)); + TensorView* tv4 = add(tv1, new Float(4.0)); + TensorView* tv5 = add(tv3, tv4); + TensorView* tv6 = add(tv1, new Float(6.0)); - // Register your outputs - fusion.addOutput(tv3); + fusion.addOutput(tv5); + fusion.addOutput(tv6); - int block_size = 16; + TensorView* computeAtTarget = tv3; - tv3->merge(0, 1); - tv3->merge(0, 1); + computeAtTarget->merge(0); + computeAtTarget->split(0, 128); + computeAtTarget->split(0, 4); - tv3->split(0, block_size); - tv3->split(0, 4); + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); + // This will have the same impact on the tensors except for tv5 and + // tv6. tv6 does not have any common consumer with the computeAt + // target, but since it uses tv1, it must be also computed at the + // same location as the other impacted tensors. We can either make + // tv5 computed at tv6 or tv6 computed at tv5. In this case, tv5 + // should be computed at tv6 just because the current implementation + // orders the computeAt relationship based on the order in which + // tensors are specified as outputs. - // Parallelize - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); + tv1->computeAt(computeAtTarget, 1); + + // All tensors should have the same dimenionality as the target + for (Val* val : fusion.vals()) { + if (fusion.hasInput(val) || + val->getValType().value() != ValType::TensorView) { + continue; + } + TensorView* tv = val->as(); + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == tv2); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + + // tv3 and tv4 are computed at tv5 + TORCH_CHECK(tv3->getComputeAtView() == tv5); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + + // tv5 should be computed at tv6 since tv5 is added as an output + // before tv6. If we call fusion.addOutput(tv6) first, tv6 should be + // computed at tv5. 
+ TORCH_CHECK(tv5->getComputeAtView() == tv6); + TORCH_CHECK(!tv6->hasComputeAt()); + + for (Val* val : fusion.vals()) { + if (!fusion.hasInput(val) && + val->getValType().value() == ValType::TensorView) { + TensorView* tv = val->as(); + tv->axis(1)->parallelize(ParallelType::Unroll); + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::rand({129, 13, 3}, options); - at::Tensor input1 = at::rand({129, 13, 3}, options); + at::Tensor t0 = at::randn({129, 127}, options); + + auto t1 = t0.mul({0.5}); + auto t2 = t1.mul({-1.0}); + auto t3 = t2.mul({-1.0}); + auto t4 = t1.add({4.0}); + auto t5 = t3 + t4; + auto t6 = t1.add({6.0}); + + at::Tensor kernel_tv5 = at::empty_like(t0, options); + at::Tensor kernel_tv6 = at::empty_like(t0, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input0, input1}); + fe.runFusion({t0}, {kernel_tv5, kernel_tv6}); - TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); + TORCH_CHECK(at::allclose(kernel_tv6, t6)); } -/* - * Helper function for single op testing that generates a codegen operand - */ +// Similar to ComputeAtCommonConsumer1 but with an addtiona ltensor +// that does not have data dependency with the consumer. +void testGPU_FusionComputeAtNoCommonConsumer() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv1 * -2 + // tv4 = tv2 + tv3 + // tv5 = tv4 * 5 + // tv6 = tv1 * 6 + Fusion fusion; + FusionGuard fg(&fusion); -Val* gen_jit_operand(std::pair desc) { - if (desc.first == ValType::TensorView) { - return makeDummyTensor(2, desc.second); - } else if (desc.first == ValType::Scalar) { - if (desc.second == DataType::Float) - return new Float(); - else if (desc.second == DataType::Int) - return new Int(); - else - TORCH_CHECK("Not currently supported type", desc.first); - } else { - TORCH_CHECK("Not currently supported type", desc.first); + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + TensorView* tv4 = add(tv2, tv3); + TensorView* tv5 = mul(tv4, new Float(5.0)); + // Notice that tv6 is not a consumer of tv4. 
+ TensorView* tv6 = mul(tv1, new Float(6.0)); + fusion.addOutput(tv3); + fusion.addOutput(tv4); + fusion.addOutput(tv5); + fusion.addOutput(tv6); + + TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4, tv6}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); } - return nullptr; -} -/* - * Helper function for single op testing that generates an ATen operand - */ + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv4); + TORCH_CHECK(tv3->getComputeAtView() == tv4); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + TORCH_CHECK(tv5->getComputeAtView() == tv6); + TORCH_CHECK(!tv6->hasComputeAt()); -IValue gen_aten_operand( - std::pair desc, - int blocks, + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + auto t4 = t2 + t3; + auto t5 = t4 * 5.0; + auto t6 = t1 * 6.0; + + at::Tensor kernel_tv3 = at::empty_like(t0, options); + at::Tensor kernel_tv4 = at::empty_like(t0, options); + at::Tensor kernel_tv5 = at::empty_like(t0, options); + at::Tensor kernel_tv6 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv3, kernel_tv4, kernel_tv5, kernel_tv6}); + + TORCH_CHECK(at::allclose(kernel_tv3, t3)); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); + TORCH_CHECK(at::allclose(kernel_tv6, t6)); +} + +namespace { + +void checkConcretized( + TensorView* v0, + int a0, + TensorView* v1, + int a1, + bool should_concretize) { + if (should_concretize) { + TORCH_CHECK( + IterDomain::concretizeDomain(v0->axis(a0))->sameAs(v1->axis(a1))); + } else { + TORCH_CHECK( + !IterDomain::concretizeDomain(v0->axis(a0))->sameAs(v1->axis(a1))); + } +} + +} // namespace + +void testGPU_FusionBCastConcretizeBasic() { + Fusion fusion; + FusionGuard fg(&fusion); + + // tv0: [I I] + TensorView* tv0 = makeDummyTensor(2); + + // tv1: [I I I] + TensorView* tv1 = makeDummyTensor(3); + + fusion.addInput(tv0); + fusion.addInput(tv1); + + // tv2*: [B I I] + auto tv2_0 = broadcast(tv0, {true, false, false}); + auto tv2_1 = broadcast(tv0, {true, false, false}); + auto tv2 = add(tv2_0, tv2_1); + + // tv3: [I I I] + auto tv3 = add(tv2, tv1); + + fusion.addOutput(tv3); + + checkConcretized(tv2, 0, tv1, 0, true); + checkConcretized(tv2_0, 0, tv1, 0, true); + checkConcretized(tv2_1, 0, tv1, 0, true); + checkConcretized(tv2_0, 1, tv1, 0, false); + checkConcretized(tv2_0, 0, tv1, 1, false); +} + +void testGPU_FusionBCastConcretizeRfactor() { + Fusion fusion; + FusionGuard fg(&fusion); + + // both tv0 and tv1 = [I, I] + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + + //[B,I,I] + auto tv2 = broadcast(tv1, {true, false, false}); + + //[B,I,R] + auto tv3 = sum(tv2, {2}); + + auto tv5 = add(tv3, tv1); + + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // scheduling: + //[B,I,R0,R1=128], root = [B,I,R] + tv3->split(2, 128); + + // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] + auto tv4 = tv3->rFactor({3}); + + checkConcretized(tv2, 0, tv5, 0, true); + checkConcretized(tv4, 0, tv5, 
0, true); + checkConcretized(tv3, 0, tv5, 0, true); +} + +namespace { + +void checkIdProvedEquivalent( + TensorView* v0, + int a0, + TensorView* v1, + int a1, + bool should_prove) { + if (should_prove) { + TORCH_CHECK(IterDomain::proveEquivalent(v0->axis(a0), v1->axis(a1))); + } else { + TORCH_CHECK(!IterDomain::proveEquivalent(v0->axis(a0), v1->axis(a1))); + } +} + +} // namespace + +void testGPU_FusionProveIdEqBasic() { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + TensorView* tv2 = makeDummyTensor(3); + + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv3 = broadcast(tv0, {true, false, false}); + auto tv4 = broadcast(tv1, {false, true, false}); + auto tv5 = add(tv3, tv4); + fusion.addOutput(tv5); + + checkIdProvedEquivalent(tv0, 0, tv4, 1, true); + checkIdProvedEquivalent(tv1, 0, tv4, 0, true); + checkIdProvedEquivalent(tv1, 1, tv0, 1, true); + checkIdProvedEquivalent(tv0, 0, tv5, 1, true); + checkIdProvedEquivalent(tv1, 1, tv5, 2, true); + checkIdProvedEquivalent(tv0, 0, tv1, 0, false); + checkIdProvedEquivalent(tv0, 1, tv1, 0, false); + checkIdProvedEquivalent(tv0, 0, tv1, 1, false); +} + +void testGPU_FusionProveIdEqRfactor() { + Fusion fusion; + FusionGuard fg(&fusion); + + // [I,I] + TensorView* tv0 = makeDummyTensor(2); + // [I,I,I] + TensorView* tv1 = makeDummyTensor(3); + + //[I,I,R] + auto tv2 = sum(tv1, {2}); + + auto tv5 = add(tv2, tv0); + + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // scheduling: + //[B,I,R0,R1=128], root = [B,I,R] + tv2->split(2, 128); + + // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] + auto tv3 = tv2->rFactor({3}); + + checkIdProvedEquivalent(tv1, 0, tv0, 0, true); + checkIdProvedEquivalent(tv2, 0, tv0, 0, true); + checkIdProvedEquivalent(tv3, 0, tv0, 0, true); +} + +void testGPU_FusionScalarInputs() { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + fusion.addInput(tv0); + TensorView* tv1 = makeDummyTensor(2); + fusion.addInput(tv1); + + Float* f0 = new Float(); + fusion.addInput(f0); + Float* f1 = new Float(); + fusion.addInput(f1); + Float* f2 = new Float(); + fusion.addInput(f2); + Float* f3 = new Float(); + fusion.addInput(f3); + Val* f4 = mul(f0, f1); + Val* f5 = sub(f2, f3); + + TensorView* tv2 = sub(tv1, f4); + TensorView* tv3 = add(tv0, f5); + TensorView* tv4 = mul(tv3, tv2); + + fusion.addOutput(tv4); + + // Lets setup to actually run + while (tv4->nDims() > 1) + tv4->merge(0); + tv4->split(0, 128); + tv4->split(0, 4); + + tv0->computeAt(tv4, 1); + tv1->computeAt(tv4, 1); + + tv4->axis(0)->parallelize(ParallelType::BIDx); + + for (Val* val : fusion.vals()) { + if (!fusion.hasInput(val) && + val->getValType().value() == ValType::TensorView) { + TensorView* tv = static_cast(val); + + tv->axis(1)->parallelize(ParallelType::Unroll); + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + } + + // f4 = f0 * f1 + // f5 = f2 - f3 + // t2 = t1 - f4 + // t3 = t0 + f5 + // t4 = t3 * t2 + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + float fl0 = 0.1; + float fl1 = -0.2; + float fl2 = 0.3; + float fl3 = -0.4; + float fl4 = fl0 * fl1; + float fl5 = fl2 - fl3; + + at::Tensor t0 = at::randn({129, 127}, options); + at::Tensor t1 = at::rand_like(t0, options); + + auto t2 = t1.sub(fl4); + auto t3 = t0.add(fl5); + auto t4 = t3.mul(t2); + + at::Tensor kernel_tv4 = at::empty_like(t0, options); + + at::Scalar test(fl0); + + torch::jit::fuser::cuda::FusionExecutor fe; + 
fe.compileFusion(&fusion); + fe.runFusion( + {t0, + t1, + at::Scalar(fl0), + at::Scalar(fl1), + at::Scalar(fl2), + at::Scalar(fl3)}, + {kernel_tv4}); + + TORCH_CHECK(at::allclose(kernel_tv4, t4)); +} + +void testGPU_FusionLoopUnroll() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(3); + TensorView* tv1 = makeDummyTensor(3); + + // Register your inputs + fusion.addInput(tv0); + fusion.addInput(tv1); + + // Do math with it, it returns a `Val*` but can be static_casted back to + // TensorView + TensorView* tv2 = add(tv1, new Float(2.0)); + TensorView* tv3 = add(tv0, tv2); + + // Register your outputs + fusion.addOutput(tv3); + + int block_size = 16; + + tv3->merge(0, 1); + tv3->merge(0, 1); + + tv3->split(0, block_size); + tv3->split(0, 4); + + // For all inputs, computeAt the output inline, temporaries should be squeezed + // between them + tv0->computeAt(tv3, 1); + tv1->computeAt(tv3, 1); + + // Parallelize + tv2->axis(1)->parallelize(ParallelType::Unroll); + tv3->axis(1)->parallelize(ParallelType::Unroll); + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(0)->parallelize(ParallelType::BIDx); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor input0 = at::rand({129, 13, 3}, options); + at::Tensor input1 = at::rand({129, 13, 3}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({input0, input1}); + + TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); +} + +/* + * Helper function for single op testing that generates a codegen operand + */ + +Val* gen_jit_operand(std::pair desc) { + if (desc.first == ValType::TensorView) { + return makeDummyTensor(2, desc.second); + } else if (desc.first == ValType::Scalar) { + if (desc.second == DataType::Float) + return new Float(); + else if (desc.second == DataType::Int) + return new Int(); + else + TORCH_CHECK("Not currently supported type", desc.first); + } else { + TORCH_CHECK("Not currently supported type", desc.first); + } + return nullptr; +} + +/* + * Helper function for single op testing that generates an ATen operand + */ + +IValue gen_aten_operand( + std::pair desc, + int blocks, int threads, bool rand) { if (desc.first == ValType::TensorView) { @@ -2012,7 +2492,7 @@ void test_op( gen_aten_operand(op, blocks, threads, /*rand*/ false).toTensor(); std::vector output_vect = {output}; cudaDeviceSynchronize(); - if (fusion.hasRNG()) + if (fusion.isStochastic()) at::manual_seed(0); torch::jit::fuser::cuda::FusionExecutor fe; @@ -2020,7 +2500,7 @@ void test_op( fe.runFusion(aten_inputs_ivalues, output_vect); cudaDeviceSynchronize(); - if (fusion.hasRNG()) + if (fusion.isStochastic()) at::manual_seed(0); at::Tensor ref_output = af(aten_inputs); cudaDeviceSynchronize(); // This sync shouldn't be necessary; @@ -2054,12 +2534,8 @@ void test_op( op_str, " -- had a mismatch.", aten_inputs_to_str(), - "\nJIT: ", - output, - "\nREF: ", - ref_output, - "\nDIFF: ", - diff, + "\nABS MAX DIFF: ", + output.sub(ref_output).abs().max(), "\n"); } @@ -2385,14 +2861,8 @@ void testGPU_FusionCastOps() { "\nOp Type: -- ", "cast FP16->FP32->FP16", " -- had a mismatch.\n", - "IN1 : ", - input1, - "\n", - "JIT: ", - outputs[0], - "\n", - "REF: ", - ref_output, + "\nABS MAX DIFF: ", + outputs[0].sub(ref_output).abs().max(), "\n"); } @@ -3453,10 +3923,6 @@ void testGPU_FusionAdvancedIndexing() { FusionGuard fg(&fusion); int w = 
3, x = 4, y = 7, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); auto tv0 = makeDummyTensor(3); auto tv1 = makeDummyTensor(4); @@ -3465,10 +3931,42 @@ void testGPU_FusionAdvancedIndexing() { auto tv2 = add(tv0, new Float(1.0)); auto tv3 = add(tv2, tv1); - fusion.addOutput(tv3); - fuser::cuda::scheduleFusion(&fusion, {t0, t1}); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({x, y, z}, options); + at::Tensor t1 = at::randn({w, x, y, z}, options); + + fuser::cuda::scheduleFusion(&fusion, {t0, t1}); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + auto t2 = t0.add(1.0); + auto t3 = t2.add(t1); + + TORCH_CHECK(t3.allclose(outputs[0])); + } + + { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeConcreteTensor({10, 20}); + fusion.addInput(tv0); + TensorView* tv1 = makeConcreteTensor({10, 10, 20}); + fusion.addInput(tv1); + + TensorView* tv2 = add(tv0, new Float(1)); + TensorView* tv3 = broadcast(tv2, {true, false, false}); + TensorView* tv4 = add(tv3, tv1); + fusion.addOutput(tv4); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({10, 20}, options); + at::Tensor t1 = at::randn({10, 10, 20}, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); @@ -4624,23 +5122,21 @@ void testGPU_FusionReductionScheduler() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand({bid_x, tid_x}, options); + at::Tensor input = at::randn({bid_x, tid_x}, options); // Apply reduction heuristic - const at::ArrayRef inputs({input}); - - TORCH_CHECK( - cuda::scheduleReduction(&fusion, inputs, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); cuda::FusionExecutor fe; fe.compileFusion(&fusion); // no broadcasting needed, omitting the last optional argument; - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum({red_dim}); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-04, 1e-04), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } @@ -4685,9 +5181,9 @@ void testGPU_FusionSymbolicReduction() { // How many threads to use for the block reduction int runtime_threadIdx_dim = 128; - torch::jit::fuser::cuda::FusionExecutor executor; - executor.compileFusion(&fusion); - auto outputs = executor.runFusion( + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( {input}, torch::jit::fuser::cuda::LaunchParams( -1, -1, -1, runtime_threadIdx_dim, -1, -1)); @@ -4716,24 +5212,22 @@ void testGPU_FusionReductionSchedulerMultiDimNonFastest() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand(tensor_dims_in, options); + at::Tensor input = at::randn(tensor_dims_in, options); at::Tensor cg_output = at::empty(tensor_dims_out, options); // Apply reduction heuristic - const at::ArrayRef inputs({input}); - - TORCH_CHECK( - 
cuda::scheduleReduction(&fusion, inputs, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum(red_dims64); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-04, 1e-04), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } @@ -4758,26 +5252,26 @@ void testGPU_FusionReductionSchedulerMultiDimFastest() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand(tensor_dims_in, options); + at::Tensor input = at::randn(tensor_dims_in, options); - TORCH_CHECK( - cuda::scheduleReduction(&fusion, {input}, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum(red_dims64); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-05, 1e-05), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } void testGPU_FusionReductionSchedulerDimShmoo() { - std::vector fp16_usage = {false}; + std::vector fp16_usage = {true, false}; std::vector red_axis = {1, 0}; std::vector output_dims = {320, 640}; std::vector red_dims; @@ -4821,40 +5315,31 @@ void testGPU_FusionReductionSchedulerDimShmoo() { .dtype((fp16 ? at::kHalf : at::kFloat)) .device(at::kCUDA, 0); at::Tensor input = - (axis ? at::rand({odim, rdim}, options) - : at::rand({rdim, odim}, options)); - - const at::ArrayRef inputs({input}); + (axis ? 
at::randn({odim, rdim}, options) + : at::randn({rdim, odim}, options)); - c10::optional rparams = - cuda::scheduleReduction(&fusion, inputs, tv1); - TORCH_CHECK(rparams != c10::nullopt, "Reduction is not found!"); + std::vector outputs_of_red; if (fp16) { - if (axis == 0) { - int tidx = rparams.value().lparams.bdimx(); - tv1_cast->split(-1, tidx); - tv1_cast->axis(-1)->parallelize(ParallelType::TIDx); - tv1_cast->axis(-2)->parallelize(ParallelType::BIDx); - } else { - if (rparams.value().mul_reds_per_blk) { - int tidy = rparams.value().lparams.bdimy(); - tv1_cast->split(0, tidy); - tv1_cast->axis(-1)->parallelize(ParallelType::TIDy); - } - tv1_cast->axis(0)->parallelize(ParallelType::BIDx); - } + outputs_of_red.push_back(tv1_cast); } + auto reduction_params = + cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params.has_value(), "Reduction is not found!"); + cuda::scheduleReduction( + &fusion, reduction_params.value(), tv1, outputs_of_red); + torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto cg_output = fe.runFusion({input}); + auto outputs = + fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum({axis}); TORCH_CHECK( - aten_output.allclose(cg_output[0]), + aten_output.allclose(outputs[0], 1e-03, 1e-03), "Error of: ", - aten_output.sub(cg_output[0]).abs().max()); + aten_output.sub(outputs[0]).abs().max()); } } } @@ -5203,6 +5688,7 @@ void testGPU_FusionSmem() { aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); } void testGPU_FusionSmemReduce() { @@ -5245,61 +5731,314 @@ void testGPU_FusionSmemReduce() { torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); + auto outputs = fe.runFusion({t0}); + + at::Tensor aten_output = sum(t0, {1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(24) == 1); +} + +void testGPU_FusionSmemBlockGemm() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + TensorView* tv0 = makeDummyTensor(2); // (M, K) + TensorView* tv1 = makeDummyTensor(2); // (K, N) + TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) + TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) + TensorView* tv4 = mul(tv2, tv3); // M, K, N + TensorView* tv5 = sum(tv4, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // Schedule + constexpr int BSX = 16; + tv5->split(2, BSX); + tv5->split(1, BSX); + tv5->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); + // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX + TensorView* tv6 = tv5->rFactor({-1}); + + tv2->setMemoryType(MemoryType::Shared); + tv3->setMemoryType(MemoryType::Shared); + tv4->setMemoryType(MemoryType::Shared); + tv6->setMemoryType(MemoryType::Shared); + + tv0->computeAt(tv5, 3); + tv1->computeAt(tv5, 3); + + // Thread and Block binding + tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(1)->parallelize(ParallelType::BIDy); + tv5->axis(-2)->parallelize(ParallelType::TIDy); + tv5->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + 
tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv4->axis(-1)->parallelize(ParallelType::TIDx); + tv6->axis(-3)->parallelize(ParallelType::TIDy); + tv6->axis(-2)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K}, options); + at::Tensor t1 = at::randn({K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + at::Tensor aten_output = matmul(t0, t1); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +void testGPU_FusionSmemBlockGemmCache() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + TensorView* tv0 = makeDummyTensor(2); // (M, K) + TensorView* tv1 = makeDummyTensor(2); // (K, N) + TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) + TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) + TensorView* tv4 = mul(tv2, tv3); // M, K, N + TensorView* tv5 = sum(tv4, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // Schedule + // Remove reduction axis from tv5 + // tv6 = (M, R, N) + // tv5 = (M, N) + TensorView* tv6 = tv5->cache_before(); + + constexpr int BSX = 16; + tv5->split(1, BSX); + tv5->split(0, BSX); + // M/BSX, BSX, N/BSX, BSX + tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); + // tv5 = M/BSX, N/BSX, MSX, NSX + + tv6->computeAt(tv5, 2); + tv6->computeAt(tv5, 2); + + tv6->split(-1, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); + // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX + TensorView* tv7 = tv6->rFactor({-1}); + // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr + // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX + + tv0->computeAt(tv6, 3); + tv1->computeAt(tv6, 3); + + tv0->computeAt(tv7, 3); + tv1->computeAt(tv7, 3); + + tv2->setMemoryType(MemoryType::Shared); + tv3->setMemoryType(MemoryType::Shared); + tv4->setMemoryType(MemoryType::Shared); + tv6->setMemoryType(MemoryType::Shared); + tv7->setMemoryType(MemoryType::Shared); + // Memory Type + + // Thread and Block binding + tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(1)->parallelize(ParallelType::BIDy); + tv5->axis(-2)->parallelize(ParallelType::TIDy); + tv5->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv4->axis(-1)->parallelize(ParallelType::TIDx); + + tv7->axis(-3)->parallelize(ParallelType::TIDy); + tv7->axis(-2)->parallelize(ParallelType::TIDx); + + tv6->axis(-2)->parallelize(ParallelType::TIDy); + tv6->axis(-1)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K}, options); + at::Tensor t1 = at::randn({K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + at::Tensor aten_output = matmul(t0, t1); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +void testGPU_FusionSmemDynamicReductionSymbolic() { + Fusion fusion; + 
FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); + fusion.addInput(tv0); + fusion.addOutput(tv1); + // tv1[I0, R1] = tv0[I0, I1] + + // Interface should just be a direct split with a Parallel type. We can + // include the parallelize call if we do this. + tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] + + TensorView* tv2 = tv1->rFactor({2}); + tv2->setMemoryType(MemoryType::Shared); + // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] + // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] + + tv0->computeAt(tv1, 1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv1->axis(0)->parallelize(ParallelType::BIDx); + + constexpr int numel_x = 65000, numel_y = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::rand({numel_x, numel_y}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {input}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + auto aten_output = input.sum({1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +void testGPU_FusionSmemDynamicReductionSymbolicArg() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + Int* sym_bsx = new Int(); + TensorView* tv0 = makeDummyTensor(3); // M, K, N + fusion.addInput(tv0); + fusion.addInput(sym_bsx); + + TensorView* tv1 = sum(tv0, {1}); // M, R, N + fusion.addOutput(tv1); + + TensorView* tv2 = tv0->cache_after(); + tv2->setMemoryType(MemoryType::Shared); + + // Schedule + constexpr int BSX = 32; + tv1->split(2, BSX); + tv1->split(1, sym_bsx); + tv1->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); + TensorView* tv3 = tv1->rFactor({-2}); + + tv0->computeAt(tv1, -2); + tv0->computeAt(tv3, -2); + + // Thread and Block binding + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::BIDy); + tv1->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K, N}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {t0, runtime_threadIdx_dim}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); at::Tensor aten_output = sum(t0, {1}); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(24) == 1); } -void testGPU_FusionSmemBlockGemm() { +void testGPU_FusionSmemDynamicPwiseMulSymbolicArgWAR() { Fusion fusion; FusionGuard fg(&fusion); - // Algorithm + Int* sym_bsx = new Int(); TensorView* tv0 = 
makeDummyTensor(2); // (M, K) TensorView* tv1 = makeDummyTensor(2); // (K, N) TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N fusion.addInput(tv0); fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Schedule - constexpr int BSX = 16; - tv5->split(2, BSX); - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv6 = tv5->rFactor({-1}); + fusion.addInput(sym_bsx); + fusion.addOutput(tv4); + // Algorithm tv2->setMemoryType(MemoryType::Shared); tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - tv0->computeAt(tv5, 3); - tv1->computeAt(tv5, 3); + constexpr int BSX = 32; + tv4->split(2, BSX); + tv4->split(1, sym_bsx); + tv4->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}}); + // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); + tv0->computeAt(tv4, 3); + tv1->computeAt(tv4, 3); + // Schedule + + tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(2)->parallelize(ParallelType::BIDy); // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv2->axis(-2)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-3)->parallelize(ParallelType::TIDy); - tv6->axis(-2)->parallelize(ParallelType::TIDx); + // Thread and Block binding - constexpr int M = 154, K = 45, N = 1524; + constexpr int M = 128, K = 457, N = 1024; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({M, K}, options); @@ -5307,100 +6046,231 @@ void testGPU_FusionSmemBlockGemm() { torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0, t1}); + auto outputs = fe.runFusion( + {t0, t1, BSX}, + torch::jit::fuser::cuda::LaunchParams(-1, -1, -1, BSX, -1, -1)); - at::Tensor aten_output = matmul(t0, t1); + at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(22) == 1); } -void testGPU_FusionSmemBlockGemmCache() { -#if 0 +void testGPU_FusionSmemDynamicTiledGemm() { Fusion fusion; FusionGuard fg(&fusion); - // Algorithm - TensorView* tv0 = makeDummyTensor(2); // (M, K) - TensorView* tv1 = makeDummyTensor(2); // (K, N) - TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) - TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) - TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N + // Symbolic integers we will use for runtime tiling + Int* symbolic_m_tile_dim = new Int(); // bound to threadIdx.z + Int* symbolic_split_k_tile_dim = new Int(); // bound to blockIdx.x + Int* symbolic_block_k_tile_dim = new Int(); // bound to threadIdx.x + // Compile-time 
integer for tiling + int n_smem_tile = 8; // bound to threadIdx.y + + // Symbolic 2D tensors TV0[M, K], TV1[K, N] + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + + // Broadcast tv0 to [M, K, *] + TensorView* tv2 = broadcast(tv0, {false, false, true}); + // Broadcast tv1 to [*, K, N] + TensorView* tv3 = broadcast(tv1, {true, false, false}); + + // Pointwise multiplication resulting in tv3[M, K, N] + TensorView* tv4 = mul(tv2, tv3); + + // Turn the K-dimension of tv4 into a reduction dimension + TensorView* tv5 = sum(tv4, {1}); + + // Register inputs and outputs fusion.addInput(tv0); fusion.addInput(tv1); fusion.addOutput(tv5); - // Schedule - // Remove reduction axis from tv5 - // tv6 = (M, R, N) - // tv5 = (M, N) - TensorView* tv6 = tv5->cache_before(); + // Register runtime tile dims as inputs + fusion.addInput(symbolic_m_tile_dim); + fusion.addInput(symbolic_split_k_tile_dim); + fusion.addInput(symbolic_block_k_tile_dim); - constexpr int BSX = 16; - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); - // tv5 = M/BSX, N/BSX, MSX, NSX + // Make a 3D tile, mix of symbolic and constant, do in reverse order because + // dims are inserted + tv5->split(2, n_smem_tile); + tv5->split(1, symbolic_block_k_tile_dim); + tv5->split(1, symbolic_split_k_tile_dim); + tv5->split(0, symbolic_m_tile_dim); - tv6->computeAt(tv5, 2); + // Reorder so all outer tiles are in the leftmost 3 positions + tv5->reorder({{1, 5}, {5, 1}}); + + // Factor out the outer reduction IterDomain, then run the inter-cta + // reduction, and intra-cta reduction + auto tv6 = tv5->rFactor({2}); + + // Scope computations tv6->computeAt(tv5, 2); - tv6->split(-1, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv7 = tv6->rFactor({-1}); - // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr - // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX + // RFactor moves reduction axes around, reorder to match ordering of tv5 + tv6->reorder({ + {2, -2}, + {3, -1}, + {4, 2}, + {5, 3}, + {6, 4}, + }); + // Setup compute at schedule tv0->computeAt(tv6, 3); tv1->computeAt(tv6, 3); + tv4->computeAt(tv6, -1); + // + // T2[Mo, bNo, Koo, Koi, Kii, Mi, bNi] CA(4, 3) + // T3[bMo, No, Koo, Koi, Kii, bMi, Ni] CA(4, 3) + // T4[ Mo, No, Koo, Koi, Kii, Mi, Ni] + // T6[ Mo, No, rKoo, Koi, Kii, Mi, Ni] + // T5[ Mo, No, rKoi, rKii, Mi, Ni] - tv0->computeAt(tv7, 3); - tv1->computeAt(tv7, 3); - + // Cache smem tiles tv2->setMemoryType(MemoryType::Shared); tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - tv7->setMemoryType(MemoryType::Shared); - // Memory Type + tv4->setMemoryType(MemoryType::Local); + tv6->setMemoryType(MemoryType::Local); - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(0)->parallelize(ParallelType::BIDz); tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv7->axis(-3)->parallelize(ParallelType::TIDy); - tv7->axis(-2)->parallelize(ParallelType::TIDx); + std::vector tv_list = {tv2, tv3, tv4, tv5, tv6}; + for (auto tv : tv_list) { + 
tv->axis(-2)->parallelize(ParallelType::TIDz); + tv->axis(-1)->parallelize(ParallelType::TIDy); + } + tv2->axis(3)->parallelize(ParallelType::TIDx); + tv3->axis(3)->parallelize(ParallelType::TIDx); + tv4->axis(3)->parallelize(ParallelType::TIDx); + tv6->axis(3)->parallelize(ParallelType::TIDx); + tv5->axis(2)->parallelize(ParallelType::TIDx); - tv6->axis(-2)->parallelize(ParallelType::TIDy); - tv6->axis(-1)->parallelize(ParallelType::TIDx); + tv2->axis(4)->parallelize(ParallelType::BIDx); + tv3->axis(4)->parallelize(ParallelType::BIDx); + tv4->axis(4)->parallelize(ParallelType::BIDx); + tv6->axis(4)->parallelize(ParallelType::BIDx); + tv5->axis(3)->parallelize(ParallelType::BIDx); - constexpr int M = 154, K = 45, N = 1524; + constexpr int M = 31, K = 65, N = 33; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); + at::Tensor A = at::randn({M, K}, options); + at::Tensor B = at::randn({K, N}, options); torch::jit::fuser::cuda::FusionExecutor fe; + // Generate CUDA and compile with nvRTC fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0, t1}); - at::Tensor aten_output = matmul(t0, t1); + // Runtime tiling + int m_tile = 4; // bound to threadIdx.z + int split_k = 7; // bound to blockIdx.x + int intra_cta = 8; // bound to threadIdx.x + + auto fuser_outputs = fe.runFusion({A, B, m_tile, split_k, intra_cta}); + auto C_fuser = fuser_outputs[0]; + + at::Tensor aten_C = mul(A.unsqueeze(2), B.unsqueeze(0)).sum(1); + TORCH_CHECK( + aten_C.allclose(C_fuser, 1e-5, 1e-5), + "Error of: ", + aten_C.sub(C_fuser).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(41) == 1); +} + +void testGPU_FusionGlobalIntermediate() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); + fusion.addInput(tv0); + fusion.addOutput(tv1); + // tv1[I0, R1] = tv0[I0, I1] + + // Interface should just be a direct split with a Parallel type. We can + // include the parallelize call if we do this. 
+ tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] + + TensorView* tv2 = tv1->rFactor({2}); + tv2->setMemoryType(MemoryType::Global); + // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] + // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] + + tv0->computeAt(tv1, 1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv1->axis(0)->parallelize(ParallelType::BIDx); + + constexpr int numel_x = 65000, numel_y = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::rand({numel_x, numel_y}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {input}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + auto aten_output = input.sum({1}); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); -#endif +} + +void testGPU_FusionGlobalIntermediateDefaultSchedule() { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + TensorView* tv2 = makeDummyTensor(2); + TensorView* tv3 = makeDummyTensor(2); + TensorView* tv4 = sub(tv2, tv3); + TensorView* tv5 = add(tv1, tv4); + TensorView* tv6 = sub(tv5, tv0); + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addInput(tv2); + fusion.addInput(tv3); + fusion.addOutput(tv6); + // t6 = ((t1 + (t2 - t3)) - t0) + + tv4->setMemoryType(MemoryType::Global); + tv5->setMemoryType(MemoryType::Global); + tv6->setMemoryType(MemoryType::Global); + + constexpr int M = 32, N = 810; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor in0 = at::rand({M, N}, options); + at::Tensor in1 = at::rand({M, N}, options); + at::Tensor in2 = at::rand({M, N}, options); + at::Tensor in3 = at::rand({M, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({in0, in1, in2, in3}); + + at::Tensor aten_output = (in1 + (in2 - in3)) - in0; + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().sum()); } void testGPU_FusionConstCheck() { @@ -5990,6 +6860,195 @@ void testGPU_FusionThreadPredicate() { TORCH_CHECK(aten_output_tv3.allclose(cg_output_tv3)); } +void testGPU_FusionLSTMCell() { + const int hidden_features = 512; + const int batch_size = 64; + + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tvs[16]; + for (size_t i = 0; i < 16; i++) { + tvs[i] = makeDummyTensor(2); + fusion.addInput(tvs[i]); + } + + auto ingate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3])); + + auto forgetgate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7])); + + auto cellgate = unaryOp( + UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11])); + + auto outgate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15])); + + auto cx = makeContigTensor(2); + fusion.addInput(cx); + + auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate)); + + auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy)); + + fusion.addOutput(cy); + fusion.addOutput(hy); + + std::vector inputs; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor large_tensor0 = + at::randn({batch_size, 
hidden_features * 4}, options); + at::Tensor large_tensor1 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor2 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor3 = + at::randn({batch_size, hidden_features * 4}, options); + + auto chunked0 = large_tensor0.chunk(4, 1); + auto chunked1 = large_tensor1.chunk(4, 1); + auto chunked2 = large_tensor2.chunk(4, 1); + auto chunked3 = large_tensor3.chunk(4, 1); + + inputs.insert(inputs.end(), chunked0.begin(), chunked0.end()); + inputs.insert(inputs.end(), chunked1.begin(), chunked1.end()); + inputs.insert(inputs.end(), chunked2.begin(), chunked2.end()); + inputs.insert(inputs.end(), chunked3.begin(), chunked3.end()); + + auto at_ingate = + chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid(); + auto at_forgetgate = + chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid(); + auto at_cellgate = + chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh(); + auto at_outgate = + chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid(); + + auto at_cx = at::randn({batch_size, hidden_features}, options); + inputs.push_back(at_cx); + auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate)); + auto at_hy = at_outgate.mul(at_cy.tanh()); + + fuser::cuda::scheduleFusion(&fusion, c10::ArrayRef(inputs)); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion(c10::ArrayRef(inputs)); + + TORCH_CHECK(at_cy.allclose(outputs[0], 1e-4, 1e-7)); + TORCH_CHECK(at_hy.allclose(outputs[1], 1e-4, 1e-7)); +} + +void testGPU_FusionComputeAtMultiBCast() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = broadcast(tv1, {true, false}); + TensorView* tv3 = broadcast(tv1, {false, true}); + TensorView* tv4 = add(tv2, tv3); + fusion.addOutput(tv4); + + // This is not supported and should throw an exception. 
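For reference, tv4 above is the outer-style sum of the scaled input: tv1 broadcast along a new leading axis plus tv1 broadcast along a new trailing axis. A minimal ATen sketch of the same math (hypothetical 1-D size; the unsupported computeAt call is asserted to throw just below):

// Sketch only (hypothetical size): what tv4 computes in ATen terms.
#include <ATen/ATen.h>

void multiBCastReferenceSketch() {
  at::Tensor x = at::randn({16}, at::kFloat);
  at::Tensor y = x * 0.5;                            // tv1
  at::Tensor ref = y.unsqueeze(0) + y.unsqueeze(1);  // broadcast({true,false}) + broadcast({false,true})
  // Same result as scaling after the broadcasted add.
  TORCH_CHECK(ref.allclose((x.unsqueeze(0) + x.unsqueeze(1)).mul(0.5)));
  TORCH_CHECK(ref.size(0) == 16 && ref.size(1) == 16);
}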
+ ASSERT_ANY_THROW(tv1->computeAt(tv3, -1)); +} + +void testGPU_FusionReductionHalf() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(3, DataType::Half); + fusion.addInput(tv0); + + auto tv1 = castOp(DataType::Float, tv0); + auto tv2 = add(tv1, new Float(1.0)); + auto tv3 = sum(tv2, {2}); + auto tv4 = castOp(DataType::Half, tv3); + + fusion.addOutput(tv4); + + const auto options = + at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + at::Tensor input = at::randn({8, 8, 16}, options); + + auto reduction_tv = tv3; + + auto outputsOfReduction = DependencyCheck::getAllOutputsOf({reduction_tv}); + + // Grab only tensor views, though there shouldn't be any other type + auto tv_entries = ir_utils::filterByType(outputsOfReduction); + + std::vector tvOutputsOfReduction( + tv_entries.begin(), tv_entries.end()); + + auto reduction_params = + cuda::getReductionHeuristics(&fusion, {input}, reduction_tv); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction( + &fusion, reduction_params.value(), reduction_tv, tvOutputsOfReduction); + + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + + cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + // no broadcasting needed, omitting the last optional argument; + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); + + auto aten_output = input.to(c10::ScalarType::Float) + .add(1.0) + .sum({2}) + .to(c10::ScalarType::Half); + + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-04, 1e-04), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); +} + +void testGPU_FusionInputsIdLookup() { + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({16, 8, 8}, options); + at::Tensor t1 = at::randn({8, 8}, options); + at::Tensor t2 = at::randn({6, 4}, options); + + // create a cache with max size 2; + auto inputs_id_lookup = torch::jit::fuser::cuda::InputsIdLookup(2); + + // testing basic function, same encoding for identical inputs + auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0}); + auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5}); + TORCH_CHECK(id_0.id == id_0_lookup.id); + TORCH_CHECK(inputs_id_lookup.size() == 1); + TORCH_CHECK(id_0.eviction == false); + + // new input (even tho same shape, but we have different signature because of + // missing scalar input + auto id_1 = inputs_id_lookup.lookupId({t0, t1}); + auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1}); + TORCH_CHECK(id_1.id == id_1_lookup.id); + TORCH_CHECK(inputs_id_lookup.size() == 2); + TORCH_CHECK(id_1.eviction == false); + + // eviction should happen at this point + auto id_2 = inputs_id_lookup.lookupId({t2, t1}); + TORCH_CHECK(id_2.id != id_0.id); + TORCH_CHECK(id_2.id != id_1.id); + TORCH_CHECK(inputs_id_lookup.size() == 2); + TORCH_CHECK(id_2.eviction == true); + TORCH_CHECK(id_2.evict_id == id_0.id); + + // look at input 1 again + auto id_1_relook = inputs_id_lookup.lookupId({t0, t1}); + TORCH_CHECK(id_1_relook.id == id_1.id); + TORCH_CHECK(id_1_relook.eviction == false); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 0285559fb8fc..a058326c2050 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -135,7 +135,16 @@ namespace jit { _(GPU_FusionCompoundOps) \ _(GPU_FusionCastOps) \ _(GPU_FusionAdvancedComputeAt) \ + _(GPU_FusionComputeAtMultiConsumers) \ + 
_(GPU_FusionComputeAtCommonConsumer1) \ + _(GPU_FusionComputeAtCommonConsumer2) \ + _(GPU_FusionComputeAtCommonConsumer3) \ + _(GPU_FusionComputeAtNoCommonConsumer) \ _(GPU_FusionScalarInputs) \ + _(GPU_FusionBCastConcretizeBasic) \ + _(GPU_FusionBCastConcretizeRfactor) \ + _(GPU_FusionProveIdEqBasic) \ + _(GPU_FusionProveIdEqRfactor) \ _(GPU_FusionRFactorReplay) \ _(GPU_FusionReduction) \ _(GPU_FusionReduction2) \ @@ -183,6 +192,12 @@ namespace jit { _(GPU_FusionSmemReduce) \ _(GPU_FusionSmemBlockGemm) \ _(GPU_FusionSmemBlockGemmCache) \ + _(GPU_FusionSmemDynamicReductionSymbolic) \ + _(GPU_FusionSmemDynamicReductionSymbolicArg) \ + _(GPU_FusionSmemDynamicPwiseMulSymbolicArgWAR) \ + _(GPU_FusionSmemDynamicTiledGemm) \ + _(GPU_FusionGlobalIntermediate) \ + _(GPU_FusionGlobalIntermediateDefaultSchedule) \ _(GPU_FusionConstCheck) \ _(GPU_FusionSymbolicReduction) \ _(GPU_FusionUnrollWithAlloc) \ @@ -197,7 +212,11 @@ namespace jit { _(GPU_FusionTraversalOrder6) \ _(GPU_FusionTraversalOrder7) \ _(GPU_FusionBranches) \ - _(GPU_FusionThreadPredicate) + _(GPU_FusionThreadPredicate) \ + _(GPU_FusionLSTMCell) \ + _(GPU_FusionComputeAtMultiBCast) \ + _(GPU_FusionReductionHalf) \ + _(GPU_FusionInputsIdLookup) #else #define TH_FORALL_TESTS_CUDA(_) \ _(GraphExecutor) \ diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py index 9d61cd5dd157..0c8a1f9a967d 100644 --- a/test/test_jit_cuda_fuser.py +++ b/test/test_jit_cuda_fuser.py @@ -550,9 +550,8 @@ def t(x: torch.Tensor, y: torch.Tensor): jit_o = t_jit(x, y) jit_o = t_jit(x, y) o = t(x, y) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GROUP) # end-2-end test of permutation & contiguity handling in integration. @@ -595,11 +594,10 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): jit_o = t_jit(x, y) jit_o = t_jit(x, y) o = t(x, y) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - # numerical issues here due to our scheduling. - # can't use `self.assertEqual(oo, jit_oo)` - self.assertTrue(self._compare("comparing output failed", oo, jit_oo, 1e-4)) + self.assertEqual(o.dtype, jit_o.dtype) + # numerical issues here due to our scheduling. 
+ # can't use `self.assertEqual(o, jit_o)` + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GROUP) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @@ -630,6 +628,81 @@ def test_reduction_permutation(self): for perm1 in itertools.permutations(range(len(x))): self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + def test_reduction_multiple_output(self): + torch._C._jit_set_bailout_depth(2) + + def t(x: torch.Tensor, y: torch.Tensor, scale: float, z: torch.Tensor): + o = torch.mul(x, y) + o = torch.mul(o, scale) + out1 = torch.mul(o, z) + out2 = torch.sum(out1, dim=[2]) + return out1, out2 + + t_jit = torch.jit.script(t) + x = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + y = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + z = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + scale = 0.5 + jit_o = t_jit(x, y, scale, z) + jit_o = t_jit(x, y, scale, z) + o = t(x, y, scale, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GROUP) + + x = x.to(memory_format=torch.channels_last) + y = y.to(memory_format=torch.channels_last) + z = z.to(memory_format=torch.channels_last) + jit_o = t_jit(x, y, scale, z) + jit_o = t_jit(x, y, scale, z) + o = t(x, y, scale, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GROUP) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + @skipIfRocm + def test_reduction_dtype(self): + def t(x: torch.Tensor): + o = torch.mul(x, 1.0) + o = torch.sum(o, dim=[2], dtype=torch.float32) + return o + t_jit = torch.jit.script(t) + + x = torch.randn(8, 4, 16, dtype=torch.float, device="cuda") + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GROUP) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + @skipIfRocm + def test_reduction_half(self): + def t(x: torch.Tensor): + o = torch.mul(x, 1.0) + o = torch.sum(o, dim=[2]) + return o + + t_jit = torch.jit.script(t) + x = torch.randn(8, 4, 16, dtype=torch.float16, device="cuda") + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GROUP) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") @@ -651,9 +724,8 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): jit_o = t_jit(x, y, z) jit_o = t_jit(x, y, z) o = t(x, y, z) - for oo, jit_oo 
in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GROUP) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @@ -676,9 +748,8 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): jit_o = t_jit(x, y, z) jit_o = t_jit(x, y, z) o = t(x, y, z) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GROUP) diff --git a/test/test_jit_cuda_fuser_legacy.py b/test/test_jit_cuda_fuser_legacy.py index 4b9959c1231e..41e16df7d686 100644 --- a/test/test_jit_cuda_fuser_legacy.py +++ b/test/test_jit_cuda_fuser_legacy.py @@ -1,5 +1,11 @@ import sys sys.argv.append("--ge_config=legacy") + +import os +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1' +os.environ['PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL'] = '0' + from test_jit_cuda_fuser import * if __name__ == '__main__': diff --git a/test/test_jit_cuda_fuser_profiling.py b/test/test_jit_cuda_fuser_profiling.py index e2869eca7b5f..7559b85519c4 100644 --- a/test/test_jit_cuda_fuser_profiling.py +++ b/test/test_jit_cuda_fuser_profiling.py @@ -1,5 +1,11 @@ import sys sys.argv.append("--ge_config=profiling") + +import os +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1' +os.environ['PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL'] = '0' + from test_jit_cuda_fuser import * if __name__ == '__main__': diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 174bb858da44..26ab975373a8 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -339,6 +339,7 @@ libtorch_cuda_sources = [ "torch/csrc/autograd/functions/comm.cpp", "torch/csrc/jit/codegen/cuda/arith.cpp", "torch/csrc/jit/codegen/cuda/compute_at.cpp", + "torch/csrc/jit/codegen/cuda/codegen.cpp", "torch/csrc/jit/codegen/cuda/dispatch.cpp", "torch/csrc/jit/codegen/cuda/expr_evaluator.cpp", "torch/csrc/jit/codegen/cuda/executor.cpp", @@ -348,6 +349,7 @@ libtorch_cuda_sources = [ "torch/csrc/jit/codegen/cuda/fusion.cpp", "torch/csrc/jit/codegen/cuda/graph_fuser.cpp", "torch/csrc/jit/codegen/cuda/index_compute.cpp", + "torch/csrc/jit/codegen/cuda/instrumentation.cpp", "torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp", "torch/csrc/jit/codegen/cuda/ir_cloner.cpp", "torch/csrc/jit/codegen/cuda/ir_graphviz.cpp", @@ -357,8 +359,10 @@ libtorch_cuda_sources = [ "torch/csrc/jit/codegen/cuda/kernel.cpp", "torch/csrc/jit/codegen/cuda/kernel_cache.cpp", "torch/csrc/jit/codegen/cuda/kernel_ir.cpp", + "torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp", "torch/csrc/jit/codegen/cuda/lower_index.cpp", "torch/csrc/jit/codegen/cuda/lower_loops.cpp", + "torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp", "torch/csrc/jit/codegen/cuda/lower_unroll.cpp", "torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp", "torch/csrc/jit/codegen/cuda/lower_utils.cpp", diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp new file mode 100644 index 000000000000..f6e791f0edba --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.cpp @@ -0,0 +1,640 @@ + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +namespace { + 
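// Overview of the generator defined below: CudaKernelGenerator walks the
// Kernel IR and prints the CUDA source for a single kernel.
// generateKernelDefinition() drives it in order:
//   1. genDeclaration() : "__global__ void <name>(...)" listing kernel inputs,
//      outputs, global work buffers and, for stochastic kernels, the extra
//      (seed, offset) arguments.
//   2. genPrologue()    : Philox RNG setup and the extern __shared__ "array"
//      buffer when dynamic shared memory or block/grid reductions are used.
//   3. genBody()        : dispatches handle() over the kernel's top-level
//      expressions; the handle() overloads further down print scalars, tensor
//      indices, arithmetic ops, loops, ifs, allocations and syncs.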
+class CudaKernelGenerator : private OptInConstDispatch { + static constexpr char* kTab = " "; + + public: + static std::string generateKernelDefinition( + const Kernel* kernel, + const std::string& kernel_name) { + CudaKernelGenerator codegen(kernel); + codegen.genDeclaration(kernel_name); + codegen.startBlock(); + codegen.genPrologue(); + codegen.genBody(); + codegen.endBlock(); + TORCH_CHECK(codegen.block_nest_level_ == 0); + return codegen.code_.str(); + } + + private: + explicit CudaKernelGenerator(const Kernel* kernel) : kernel_(kernel) {} + + // Generates the kernel function declaration + void genDeclaration(const std::string& kernel_name) { + const auto& kernel_summary = kernel_->summary(); + + code_ << "__global__ void " << kernel_name << "("; + + std::vector params; + + // Inputs + for (auto val : kernel_->inputs()) { + params.push_back(val); + } + + // Outputs + for (auto val : kernel_->outputs()) { + params.push_back(val); + } + + // Global buffers + for (auto allocate : kernel_summary.global_allocations) { + params.push_back(allocate->buffer()); + } + + // Generate parameter declarations + for (Val* val : params) { + switch (val->getValType().value()) { + case ValType::KirTensorView: { + // TODO(kir): review this + const auto tv = val->as(); + code_ << "Tensor<" << val->getDataType().value() << ", " + << TensorDomain::noReductions( + tv->fuserTv()->getMaybeRFactorDomain()) + .size() + << "> " << gen(tv); + break; + } + case ValType::KirScalar: + code_ << val->getDataType().value() << " " << gen(val); + break; + default: + TORCH_CHECK(!"Unexpected parameter type"); + } + + if (val != params.back()) { + code_ << ", "; + } + } + + // Kernels generating random numbers take extra (seed, offset) arguments + if (kernel_summary.is_stochastic) { + code_ << ", unsigned long long seed, unsigned long long offset"; + } + + code_ << ") "; + } + + // Generates setup code which is executed before the kernel body + void genPrologue() { + const auto& kernel_summary = kernel_->summary(); + + // Random number generator (optional) + if (kernel_summary.is_stochastic) { + indent() << "const int idx = blockIdx.x*blockDim.x + threadIdx.x;\n"; + indent() << "Philox rnd(seed, idx, offset);\n"; + } + + // Do we have any dynamic shared memory buffers? + const bool has_dynamic_smem = + !kernel_summary.dynamic_smem_allocations.empty(); + + // Do we have any reductions? 
+ const bool has_reductions = kernel_summary.has_block_reductions || + kernel_summary.has_grid_reductions; + + // Shared memory + if (has_dynamic_smem || has_reductions) { + indent() << "alignas(" + << dataTypeSize(kernel_summary.largest_smem_data_type) + << ") extern __shared__ char array[];\n"; + + if (has_dynamic_smem) { + indent() << "unsigned offset = 0;\n"; + } + + if (has_reductions) { + indent() << "void* shared_mem = array;\n"; + if (has_dynamic_smem) { + indent() << "offset += " + << "((blockDim.x * blockDim.y * blockDim.z) * sizeof(" + << kernel_summary.largest_smem_data_type << "));\n"; + } + } + } + } + + void genBody() { + for (auto expr : kernel_->topLevelExprs()) { + OptInConstDispatch::handle(expr); + } + } + + void startBlock(bool continuation = false) { + if (continuation) { + code_ << "{\n"; + } else { + indent() << "{\n"; + } + ++block_nest_level_; + } + + void endBlock(const char* sep = "\n") { + --block_nest_level_; + TORCH_CHECK(block_nest_level_ >= 0); + indent() << "}" << sep; + } + + std::ostream& indent() { + for (int i = 0; i < block_nest_level_; ++i) { + code_ << kTab; + } + return code_; + } + + std::string gen(const Statement* stmt) { + std::stringstream tmp_code; + std::swap(tmp_code, code_); + handle(stmt); + std::swap(tmp_code, code_); + return tmp_code.str(); + } + + std::string gen(const kir::TensorView* tv) { + std::stringstream tv_name; + tv_name << "T" << tv->name(); + return tv_name.str(); + } + + std::string genInline(const Statement* stmt) { + const bool saved_inline = print_inline_; + print_inline_ = true; + const auto result = gen(stmt); + print_inline_ = saved_inline; + return result; + } + + void handle(const Statement* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const Expr* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const Val* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const kir::Bool* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "b" << node->name(); + } else { + code_ << *node->value(); + } + } + + void handle(const kir::Float* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "f" << node->name(); + } else { + const int digits = std::numeric_limits::max_digits10; + code_ << "float(" << std::setprecision(digits) << *node->value() << ")"; + } + } + + void handle(const kir::Half* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "h" << node->name(); + } else { + code_ << "__float2half(" << *node->value() << ")"; + } + } + + void handle(const kir::Int* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "i" << node->name(); + } else { + code_ << *node->value(); + } + } + + void handle(const kir::NamedScalar* node) final { + code_ << node->name(); + } + + void handle(const kir::TensorIndex* node) final { + code_ << gen(node->view()) << "["; + + bool first = true; + for (auto* ind : node->indices()) { + if (!ind->isZeroInt()) { + if (!first) { + code_ << " + "; + } + code_ << genInline(ind); + first = false; + } + } + + if (first) { + code_ << "0"; + } + + code_ << "]"; + } + + 
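  // Example of the TensorIndex printing above (names are illustrative only):
  // indices {i3, 0, i5} on a tensor view T2 are emitted as "T2[i3 + i5]";
  // zero indices are skipped and an all-zero index list collapses to "T2[0]".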
void handle(const kir::IterDomain* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::TensorDomain* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::TensorView* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::UnaryOp* node) final { + if (!print_inline_) { + indent() << gen(node->out()); + if (!node->out()->isScalar() && !node->in()->isScalar()) { + code_ << "\n"; + indent() << kTab; + } + code_ << " = "; + } + + if (auto op = inline_op_str(node->getUnaryOpType())) { + code_ << *op << gen(node->in()); + } else { + if (node->getUnaryOpType() == UnaryOpType::Cast) { + const auto cast_str = + cast_func_str({node->in()->getDataType().value(), + node->out()->getDataType().value()}); + code_ << cast_str.value(); + } else { + code_ << node->getUnaryOpType(); + } + + code_ << "("; + if (node->getUnaryOpType() == UnaryOpType::RandLike) { + code_ << "rnd"; + } else { + code_ << gen(node->in()); + } + code_ << ")"; + } + + if (!print_inline_) { + code_ << ";\n"; + } + } + + std::string genBinaryOp( + BinaryOpType op_type, + const std::string& lhs, + const std::string& rhs) { + std::stringstream expr; + if (auto op = inline_op_str(op_type)) { + expr << lhs << " " << *op << " " << rhs; + } else { + expr << op_type << "(" << lhs << ", " << rhs << ")"; + } + return expr.str(); + } + + void handle(const kir::BinaryOp* node) final { + const auto op_type = node->getBinaryOpType(); + if (print_inline_) { + // Inline expression: `lhs op rhs` + code_ << genBinaryOp(op_type, gen(node->lhs()), gen(node->rhs())); + } else { + indent() << gen(node->out()); + if (node->out()->isScalar()) { + // Single line: `out = lhs op rhs;` + code_ << " = " + << genBinaryOp(op_type, gen(node->lhs()), gen(node->rhs())); + } else { + // Split TensorView expressions across multiple lines: + // + // out + // = lhs + // op rhs; + // + if (auto op = inline_op_str(op_type)) { + code_ << "\n"; + indent() << kTab << "= " << gen(node->lhs()) << "\n"; + indent() << kTab << *op << " " << gen(node->rhs()); + } else { + code_ << " = " << op_type << "(\n"; + indent() << kTab << gen(node->lhs()) << ",\n"; + indent() << kTab << gen(node->rhs()) << ")"; + } + } + code_ << ";\n"; + } + } + + void handle(const kir::TernaryOp* node) final { + if (!print_inline_) { + indent() << gen(node->out()); + if (!node->out()->isScalar()) { + code_ << "\n"; + indent() << kTab; + } + code_ << " = "; + } + + code_ << node->getTernaryOpType() << "(" << gen(node->in1()) << ", " + << gen(node->in2()) << ", " << gen(node->in3()) << ")"; + + if (!print_inline_) { + code_ << ";\n"; + } + } + + std::string genReductionOp(BinaryOpType op_type, DataType data_type) { + std::stringstream lambda; + lambda << "[](" << data_type << " &a, " << data_type << " b) " + << "{ a = " << genBinaryOp(op_type, "a", "b") << "; }"; + return lambda.str(); + } + + void handle(const kir::BroadcastOp* node) final { + const ir_utils::ParallelTypeBitmap domains = + ir_utils::getParallelBroadcastDomains( + node->out(), kernel_->predicateMap()); + + const bool thread_x = domains.get(ParallelType::TIDx); + const bool thread_y = domains.get(ParallelType::TIDy); + const bool thread_z = domains.get(ParallelType::TIDz); + const bool block_x = domains.get(ParallelType::BIDx); + const bool block_y = domains.get(ParallelType::BIDy); + const bool block_z = domains.get(ParallelType::BIDz); + + const bool grid_broadcast_needed = block_x || block_y || block_z; + const bool block_broadcast_needed = 
thread_x || thread_y || thread_z; + + TORCH_INTERNAL_ASSERT( + !grid_broadcast_needed, + "Parallel broadcast across blocks not supported"); + + if (block_broadcast_needed) { + const auto data_type = node->out()->getDataType().value(); + indent() << "broadcast::blockBroadcast<" << (thread_x ? "true" : "false") + << ", " << (thread_y ? "true" : "false") << ", " + << (thread_z ? "true" : "false") << ">(\n"; + indent() << kTab << gen(node->out()) << ",\n"; + indent() << kTab << gen(node->in()) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem));\n"; + } else { + indent() << gen(node->out()) << "\n"; + indent() << kTab << " = " << gen(node->in()) << ";\n"; + } + } + + void handle(const kir::ReductionOp* node) final { + TORCH_CHECK(node->out()->getValType() == ValType::TensorIndex); + + const auto out = node->out()->as(); + const auto domain = out->view()->domain(); + + const bool has_block_reduce = domain->hasBlockReduction(); + const bool has_grid_reduce = domain->hasGridReduction(); + + if (!has_block_reduce && !has_grid_reduce) { + const auto gen_out = gen(out); + const auto op_type = node->getReductionOpType(); + indent() << gen_out << " = " + << genBinaryOp(op_type, gen_out, gen(node->in())) << ";\n"; + return; + } + + const auto par_domains = node->getParallelReductionDomains(); + const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end(); + const bool tidy = par_domains.find(ParallelType::TIDy) != par_domains.end(); + const bool tidz = par_domains.find(ParallelType::TIDz) != par_domains.end(); + + const auto data_type = node->out()->getDataType().value(); + const auto op_type = node->getReductionOpType(); + + if (has_block_reduce) { + if (has_grid_reduce) { + indent() << data_type << " " + << "block_result" + << ";\n"; + } + indent() << "blockReduce<" << (tidx ? "true" : "false") << ", " + << (tidy ? "true" : "false") << ", " << (tidz ? 
"true" : "false") + << ">(\n"; + if (has_grid_reduce) { + indent() << kTab << "block_result" + << ",\n"; + } else { + indent() << kTab << gen(node->out()) << ",\n"; + } + indent() << kTab << gen(node->in()) << ",\n"; + indent() << kTab << genReductionOp(op_type, data_type) << ",\n"; + indent() << kTab << "threadIdx,\n"; + indent() << kTab << "blockDim,\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + if (node->pred() == nullptr) { + indent() << kTab << "true,\n"; + } else { + indent() << kTab << genInline(node->pred()) << ",\n"; + } + indent() << kTab << genInline(node->init()) << ");\n"; + } + } + + void handle(const kir::GridReduction* node) final { + const auto rop = node->reduction_op(); + TORCH_INTERNAL_ASSERT(rop->out()->getValType() == ValType::TensorIndex); + + const auto out = rop->out()->as(); + const auto domain = out->view()->domain(); + TORCH_INTERNAL_ASSERT(domain->hasGridReduction()); + + const auto par_domains = rop->getParallelReductionDomains(); + const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end(); + const bool tidy = par_domains.find(ParallelType::TIDy) != par_domains.end(); + const bool tidz = par_domains.find(ParallelType::TIDz) != par_domains.end(); + const bool bidx = par_domains.find(ParallelType::BIDx) != par_domains.end(); + const bool bidy = par_domains.find(ParallelType::BIDy) != par_domains.end(); + const bool bidz = par_domains.find(ParallelType::BIDz) != par_domains.end(); + + const auto data_type = rop->out()->getDataType().value(); + const auto op_type = rop->getReductionOpType(); + + TORCH_INTERNAL_ASSERT( + node->reduction_buffer()->buffer()->getValType().value() == + ValType::KirTensorView); + TORCH_INTERNAL_ASSERT( + node->sync_buffer()->buffer()->getValType().value() == + ValType::KirTensorView); + const auto work_buffer = + node->reduction_buffer()->buffer()->as(); + const auto sync_buffer = + node->sync_buffer()->buffer()->as(); + + // Since block-level reduction is already done, those dimensions + // with tidx/y/z being true do not participate in the grid reduction. + indent() << kir::GridReduction::getPredicateFlagName(out->view()) << " = " + << "reduction::gridReduce<" << (bidx ? "true" : "false") << ", " + << (bidy ? "true" : "false") << ", " << (bidz ? "true" : "false") + << ", " << (!tidx ? "true" : "false") << ", " + << (!tidy ? "true" : "false") << ", " << (!tidz ? 
"true" : "false") + << ">(\n"; + indent() << kTab << gen(rop->out()) << ",\n"; + if (domain->hasBlockReduction()) { + indent() << kTab << "block_result" + << ",\n"; + } else { + indent() << kTab << gen(rop->in()) << ",\n"; + } + indent() << kTab << genReductionOp(op_type, data_type) << ",\n"; + indent() << kTab << "&" << gen(work_buffer) << "[0],\n"; + indent() << kTab << gen(sync_buffer) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + if (node->pred() == nullptr) { + indent() << kTab << "true,\n"; + } else { + indent() << kTab << genInline(node->pred()) << ",\n"; + } + indent() << kTab << genInline(node->reduction_op()->init()) << ");\n"; + } + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Woverloaded-virtual" + // TODO(Kir): fix me + void handle(const kir::Scope& scope) { + for (auto expr : scope.exprs()) { + handle(expr); + } + } +#pragma clang diagnostic pop + + void handle(const kir::ForLoop* node) final { + // TODO(kir): handle this during lowering + if (node->iter_domain()->isThread() || node->iter_domain()->isBroadcast()) { + handle(node->body()); + return; + } + + const auto gen_index = gen(node->index()); + const auto gen_start = genInline(node->iter_domain()->start()); + const auto gen_extent = genInline(node->iter_domain()->extent()); + indent() << "for(size_t " << gen_index << " = " << gen_start << "; " + << gen_index << " < " << gen_extent << "; ++" << gen_index << ") "; + + startBlock(true); + handle(node->body()); + endBlock(); + } + + void handle(const kir::IfThenElse* node) final { + indent() << "if (" << genInline(node->cond()) << ") "; + + // "then" block + startBlock(true); + handle(node->thenBody()); + + // "else" block (optional) + if (node->hasElse()) { + endBlock(" else "); + startBlock(true); + handle(node->elseBody()); + } + + endBlock(); + } + + // TODO(kir): fold initialization into Allocate + void handle(const kir::Allocate* node) final { + if (node->buffer()->getValType().value() != ValType::KirTensorView) { + indent() << node->buffer_type() << " " << gen(node->buffer()) << ";\n"; + return; + } + + const auto tv = node->buffer()->as(); + TORCH_INTERNAL_ASSERT(tv->domain()->nDims() > 0); + TORCH_INTERNAL_ASSERT(node->size() != nullptr); + + switch (tv->memoryType()) { + case MemoryType::Global: + indent() << "// Allocate global tensor " << gen(tv) << "\n"; + break; + case MemoryType::Shared: + if (node->size()->isConstScalar()) { + // Static shared memory + indent() << "__shared__ " << node->buffer_type() << " " << gen(tv) + << "[" << genInline(node->size()) << "];\n"; + } else { + // Align Offset Position + indent() << "offset = alignBufferSize(offset," + << dataTypeSize(node->buffer_type()) << ");\n"; + // Shared Memory Pointer + indent() << node->buffer_type() << "* " << gen(tv) + << " = reinterpret_cast<" << node->buffer_type() << "*>" + << "(array + offset);\n"; + // Increment Offset Position + indent() << "offset += (" << genInline(node->size()) << " * sizeof(" + << node->buffer_type() << "));\n"; + } + break; + case MemoryType::Local: + indent() << node->buffer_type() << " " << gen(tv) << "[" + << genInline(node->size()) << "];\n"; + break; + default: + TORCH_INTERNAL_ASSERT(false, "Unexpected memory type"); + } + } + + void handle(const kir::Sync* node) final { + indent() << "__syncthreads();\n"; + } + + private: + std::stringstream code_; + const Kernel* kernel_; + int block_nest_level_ = 0; + + // TODO(kir): replace with explicit assignment statements + bool print_inline_ = false; +}; + +} // 
namespace + +std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name) { + FUSER_PERF_SCOPE("generateCudaKernel"); + return CudaKernelGenerator::generateKernelDefinition(kernel, kernel_name); +} + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/codegen.h b/torch/csrc/jit/codegen/cuda/codegen.h new file mode 100644 index 000000000000..562aa1554eb2 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.h @@ -0,0 +1,22 @@ + +#pragma once + +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +//! Generates a CUDA kernel definition for the given kernel +TORCH_CUDA_API std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name = "CUDAGeneratedKernel"); + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/compute_at.cpp b/torch/csrc/jit/codegen/cuda/compute_at.cpp index 3e0f5303b966..9f8f7aba1cf4 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -20,11 +21,10 @@ ComputeAtData::ComputeAtData(TensorView* tv) void ComputeAtData::clearPass() { // If the last pass set a position, update the new_compute_at_position if // latest position would be greater than previously set. - auto pass_pos = current_traversal_position_set ? current_traversal_position - : new_compute_at_position; - - new_compute_at_position = - pass_pos > new_compute_at_position ? pass_pos : new_compute_at_position; + if (current_traversal_position_set && + current_traversal_position > new_compute_at_position) { + new_compute_at_position = current_traversal_position; + } current_traversal_position_set = false; current_traversal_position = 0; @@ -52,16 +52,19 @@ void ComputeAtData::setPassPosition(unsigned int pos) { } unsigned int ComputeAtData::getNewPosition() const { - // If the last pass set a position, update the new_compute_at_position if - // latest position would be greater than previously set. - auto pass_pos = current_traversal_position_set ? current_traversal_position - : new_compute_at_position; - - return pass_pos > new_compute_at_position ? pass_pos - : new_compute_at_position; + // If the last pass set a position, return the latest position if + // it would be greater than previously set. + if (current_traversal_position_set && + current_traversal_position > new_compute_at_position) { + return current_traversal_position; + } else { + return new_compute_at_position; + } } void ComputeAtData::validateNewComputeAt() const { + FUSER_PERF_SCOPE("validateNewComputeAt"); + TORCH_INTERNAL_ASSERT( getNewPosition() >= original_compute_at_position, "Invalid computeAt detected. 
This computeAt would invalidate the set computeAt on ", @@ -82,7 +85,22 @@ void ComputeAtData::validateNewComputeAt() const { "."); } +void ComputeAtData::setComputeAtDomain(TensorDomain* td) { + if (new_compute_at_domain_ != original_domain_) { + TORCH_INTERNAL_ASSERT( + *new_compute_at_domain_ == *td, + "TensorDomain, ", + td, + ", does not match with the previously set domain of ", + tv_ref_, + ", which is ", + new_compute_at_domain_); + } + new_compute_at_domain_ = td; +} + namespace { + // Wrapper around set_intersection template std::set set_intersection(const std::set& set1, const std::set& set2) { @@ -121,12 +139,15 @@ std::deque> tvChains( } return tv_chains; } + } // namespace void ComputeAt::run( TensorView* producer, TensorView* consumer, unsigned int consumer_position) { + FUSER_PERF_SCOPE("ComputeAt::run"); + // Make sure the correct fusion is setup between this and consumer. TORCH_CHECK( producer->fusion() == consumer->fusion(), @@ -160,6 +181,9 @@ void ComputeAt::run( // Check all dependency chains, select the next TV after producer towards // consumer. These are the TVs we're going to actually call computeAt on. for (const auto& tv_chain : all_chains) { + // When a chain only has two tensors, they must be the producer, + // which is an input, and the consumer. There is nothing we need + // to do for such chains. if (tv_chain.size() > 2) { // Make sure we only add once, but we want to add in a determinsitic // order @@ -188,6 +212,8 @@ unsigned int ComputeAt::backwardComputeAt_impl( TensorView* producer, TensorView* consumer, unsigned int consumer_compute_at_axis) { + FUSER_PERF_SCOPE("backwardComputeAt_impl"); + auto& producer_entry = tv_data.at(producer); // Use TensorDomain interface so it doesn't set computeAt automatically @@ -209,6 +235,8 @@ unsigned int ComputeAt::forwardComputeAt_impl( TensorView* producer, TensorView* consumer, unsigned int producer_compute_at_axis) { + FUSER_PERF_SCOPE("forwardComputeAt_impl"); + auto& consumer_entry = tv_data.at(consumer); const auto& producer_entry = tv_data.at(producer); @@ -229,6 +257,8 @@ unsigned int ComputeAt::forwardComputeAt_impl( } void ComputeAt::setCommonConsumer() { + FUSER_PERF_SCOPE("ComputeAt::setCommonConsumer"); + // Convert the first chain to a set. std::set common_consumers( producer_use_chains_.front().begin(), producer_use_chains_.front().end()); @@ -281,6 +311,8 @@ void ComputeAt::setCommonConsumer() { // Similar to backward traversal in traverseAllKnown but we should only apply // computeAt if it will increase computeAt positions. void ComputeAt::traverseBackward() { + FUSER_PERF_SCOPE("ComputeAt::traverseBackward"); + // propagate *backward* through all *producer* use_chains or from *producer* // to common_consumer if common_consumer exists. Only apply transform if // increases computeAt position. @@ -307,6 +339,8 @@ void ComputeAt::traverseBackward() { } void ComputeAt::traverseForward() { + FUSER_PERF_SCOPE("ComputeAt::traverseForward"); + // propagate forward through all *producer* use_chains or from *producer* to // common_consumer if common_consumer exists. 
auto chains = producer_use_chains_; @@ -338,6 +372,8 @@ void ComputeAt::traverseForward() { } void ComputeAt::runPass() { + FUSER_PERF_SCOPE("ComputeAt::runPass"); + // Initialize tv_data for all TensorViews we may modify auto chains = producer_use_chains_; if (common_consumer_ != nullptr) { @@ -382,6 +418,8 @@ void ComputeAt::runPass() { } void ComputeAt::setupOutputs() { + FUSER_PERF_SCOPE("ComputeAt::setupOutputs"); + if (common_consumer_ != nullptr) return; @@ -421,9 +459,6 @@ ComputeAt::ComputeAt( : producer_(_producer), consumer_(_consumer), consumer_position_(_consumer_position) { - if (consumer_position_ < 0) - consumer_position_ += consumer_->nDims(); - TORCH_INTERNAL_ASSERT( consumer_position_ >= 0 && consumer_position_ <= consumer_->nDims(), "Invalid computeAt axis, received ", diff --git a/torch/csrc/jit/codegen/cuda/compute_at.h b/torch/csrc/jit/codegen/cuda/compute_at.h index 84677ae99448..a9112a6225ca 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.h +++ b/torch/csrc/jit/codegen/cuda/compute_at.h @@ -56,9 +56,7 @@ class ComputeAtData { // If we set computeAt, save the domain so we can reset it after traversal. // Traversal state can deviate from the domain we will want to save after the // entire computeAt pass. - void setComputeAtDomain(TensorDomain* td) { - new_compute_at_domain_ = td; - } + void setComputeAtDomain(TensorDomain* td); // Return domain set in setComputeAtDomain TensorDomain* getComputeAtDomain() const { diff --git a/torch/csrc/jit/codegen/cuda/docs/.gitignore b/torch/csrc/jit/codegen/cuda/docs/.gitignore new file mode 100644 index 000000000000..1936cc1d441e --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/.gitignore @@ -0,0 +1 @@ +html diff --git a/torch/csrc/jit/codegen/cuda/docs/documentation.h b/torch/csrc/jit/codegen/cuda/docs/documentation.h new file mode 100644 index 000000000000..cfd4435461b9 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/documentation.h @@ -0,0 +1,23 @@ + +#error This is used exclusively for generating the documentation (not a real header) + +//! \namespace torch::jit::fuser +//! \brief Main PyTorch JIT Fuser namespace + +//! \namespace torch::jit::fuser::cuda +//! \brief CUDA specific components + +//! \namespace torch::jit::fuser::cuda::executor_utils +//! \brief Fuser executor related utilities + +//! \namespace torch::jit::fuser::kir +//! \brief Kernel IR + +//! \namespace torch::jit::fuser::ir_utils +//! \brief IR manipulation utilities + +//! \namespace torch::jit::fuser::loop_utils +//! \brief Loop utilities + +//! \namespace torch::jit::fuser::scope_utils +//! \brief Scope utilities diff --git a/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen b/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen new file mode 100644 index 000000000000..b9a51b187aa5 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen @@ -0,0 +1,2515 @@ +# Doxyfile 1.8.14 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). 
+ +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. + +PROJECT_NAME = "PyTorch JIT Fuser" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
+ +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = YES + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = YES + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. 
+ +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 0. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 0 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) 
but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = YES + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. 
+ +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. + +# TODO: switch to NO once key concepts are documented +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. 
+ +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. 
+# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = NO + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. 
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of the
+# FILE_VERSION_FILTER tag, and <input-file> is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+ +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = NO + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT += .. +INPUT += documentation.h +INPUT += main_page.md + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. 
See the libiconv +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE += + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS += Ui +EXCLUDE_SYMBOLS += internal +EXCLUDE_SYMBOLS += __* + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. 
+
+EXAMPLE_PATTERNS = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH = images
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+#   <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE = main_page.md
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. 
+# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = --std=c++1z + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: 0. + +CLANG_COMPILATION_DATABASE_PATH = 0 + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. 
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. 
+# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via Javascript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have Javascript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: https://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 1 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/ + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. 
See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically <CTRL>, <ALT>/