From 03dde4c62af35a8a8a0c2e1ea9f6486ac897a780 Mon Sep 17 00:00:00 2001 From: Dianshi Li Date: Thu, 24 Sep 2020 18:39:54 -0700 Subject: [PATCH 001/292] Resend diff D23858329 (#45315) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45315 Pull Request resolved: https://github.com/pytorch/pytorch/pull/45314 in D23858329 (https://github.com/pytorch/pytorch/commit/721cfbf8425cf2c1dc5e27d1332e32e1a42ef541), we put PriorCorrectionCalibrationPrediction unit test in OSS file which causes test failure issue in public trunk. this diff moves it to FB only test file. Test Plan: ``` buck test //caffe2/caffe2/python/operator_test:torch_integration_test -- test_gather_ranges_to_dense_op buck test //caffe2/caffe2/fb/python/operator_test:torch_integration_test -- test_prior_correct_calibration_prediction_op ``` all pass. Reviewed By: houseroad Differential Revision: D23899012 fbshipit-source-id: 1ed97d8702e2765991e6caf5695d4c49353dae82 --- caffe2/operators/gather_ranges_to_dense_op.cc | 8 ++++ caffe2/operators/gather_ranges_to_dense_op.h | 3 ++ .../operator_test/torch_integration_test.py | 41 +++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/caffe2/operators/gather_ranges_to_dense_op.cc b/caffe2/operators/gather_ranges_to_dense_op.cc index 10396aafc97e..aa31ef12b36a 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.cc +++ b/caffe2/operators/gather_ranges_to_dense_op.cc @@ -104,3 +104,11 @@ NO_GRADIENT(GatherRangesToDense); } // namespace } // namespace caffe2 + +using GatherRangesToDenseCPUOp = + caffe2::GatherRangesToDenseOp; + +C10_EXPORT_CAFFE2_OP_TO_C10_CPU( + GatherRangesToDense, + "_caffe2::GatherRangesToDense(Tensor data, Tensor ranges, Tensor? key, int[] lengths, int min_observation, float max_mismatched_ratio, float max_empty_ratio) -> Tensor[] outputs", + GatherRangesToDenseCPUOp); diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index c1dd5a527005..217a61b25129 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" +#include "caffe2/core/export_caffe2_op_to_c10.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" @@ -15,6 +16,8 @@ #include #include +C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(GatherRangesToDense); + namespace caffe2 { template class GatherRangesToDenseOp final : public Operator { diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index 55f26a89987f..9bec64764240 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -875,6 +875,47 @@ def _batch_bucket_one_hot_ref(data, lengths, boundaries): ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) + def test_gather_ranges_to_dense_op(self): + data = np.array([1, 2, 3, 4, 5, 6, 7, 8]) + ranges = np.array([[[2, 4]], [[0, 0]]]) + key = np.array([0, 1, 3, 2, 1, 0, 1, 0]) + lengths = np.array([4]) + min_observation = 2 + max_mismatched_ratio = 0.5 + max_empty_ratio = 1.0 + + outputs_name = ["X_{}".format(i) for i in range(len(lengths))] + ref_op = core.CreateOperator( + "GatherRangesToDense", + ["data", "ranges", "key"], + outputs_name, + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + workspace.FeedBlob("data", 
data) + workspace.FeedBlob("ranges", ranges) + workspace.FeedBlob("key", key) + workspace.RunOperatorOnce(ref_op) + ref_outputs = [] + for output_name in outputs_name: + ref_outputs.append(workspace.FetchBlob(output_name)) + + outputs = torch.ops._caffe2.GatherRangesToDense( + torch.from_numpy(data), + torch.from_numpy(ranges), + torch.from_numpy(key), + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + + self.assertEqual(len(ref_outputs), len(outputs)) + for i in range(0, len(ref_outputs)): + np.testing.assert_array_almost_equal(ref_outputs[i], outputs[i].numpy()) + @given(lengths_0=st.integers(1, 10), lengths_1=st.integers(1, 10)) @settings(deadline=1000) def test_merge_id_lists(self, lengths_0, lengths_1): From 0f2c648c970d33fe7cc6a8198e9ce59a584ae734 Mon Sep 17 00:00:00 2001 From: Linbin Yu Date: Thu, 24 Sep 2020 20:06:42 -0700 Subject: [PATCH 002/292] log metadata when model loading failed (#44430) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44430 log metadata even when model loading is failed Test Plan: {F331550976} Reviewed By: husthyc Differential Revision: D23577711 fbshipit-source-id: 0504e75625f377269f1e5df0f1ebe34b8e564c4b --- torch/csrc/jit/mobile/import.cpp | 19 +++++++++++++++---- torch/csrc/jit/mobile/observer.h | 3 +++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index e812fd978c9f..e26177605674 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -228,6 +228,8 @@ class BytecodeDeserializer final { public: explicit BytecodeDeserializer(std::unique_ptr reader); mobile::Module deserialize(c10::optional device); + std::unordered_map deserializeMetadata( + c10::optional device); private: c10::IValue readArchive( @@ -246,6 +248,13 @@ BytecodeDeserializer::BytecodeDeserializer( : compilation_unit_(std::make_shared()), reader_(std::move(reader)) {} +std::unordered_map BytecodeDeserializer:: + deserializeMetadata(c10::optional device) { + device_ = device; + auto mcu = std::make_shared(); + return readMobileMetadata(mcu); +} + mobile::Module BytecodeDeserializer::deserialize( c10::optional device) { device_ = device; @@ -397,9 +406,9 @@ mobile::Module _load_for_mobile( if (observer) { observer->onEnterLoadModel(); } + auto reader = torch::make_unique(std::move(rai)); + BytecodeDeserializer deserializer(std::move(reader)); try { - auto reader = torch::make_unique(std::move(rai)); - BytecodeDeserializer deserializer(std::move(reader)); mobile::Module result = deserializer.deserialize(std::move(device)); std::unordered_map copied_metadata = result.metadata(); @@ -412,7 +421,8 @@ mobile::Module _load_for_mobile( return result; } catch (c10::Error& error) { if (observer) { - observer->onFailLoadModel(error.what()); + observer->onFailLoadModel( + error.what(), deserializer.deserializeMetadata(std::move(device))); } TORCH_RETHROW(error); } catch (...) 
{ @@ -429,7 +439,8 @@ mobile::Module _load_for_mobile( } } catch (c10::Error& error) { if (observer) { - observer->onFailLoadModel(error.what()); + observer->onFailLoadModel( + error.what(), deserializer.deserializeMetadata(std::move(device))); } TORCH_RETHROW(error); } diff --git a/torch/csrc/jit/mobile/observer.h b/torch/csrc/jit/mobile/observer.h index fde99f501f72..2935fa078fc7 100644 --- a/torch/csrc/jit/mobile/observer.h +++ b/torch/csrc/jit/mobile/observer.h @@ -78,6 +78,9 @@ class MobileModuleObserver { virtual void onExitLoadModel( const std::unordered_map&) {} virtual void onFailLoadModel(const char*) {} + virtual void onFailLoadModel( + const char*, + const std::unordered_map&) {} }; class MobileObserverConfig { From 7e5492e1bedef05752f8c8961d8bcc1a7e5f641e Mon Sep 17 00:00:00 2001 From: Xiao Wang <24860335+xwang233@users.noreply.github.com> Date: Thu, 24 Sep 2020 20:09:47 -0700 Subject: [PATCH 003/292] [minor] Fix undefined variable (#45246) Summary: The commit https://github.com/pytorch/pytorch/commit/2a37f3fd2f74e2d10f3440e6dfef2d5389caab62 https://github.com/pytorch/pytorch/pull/45130 deleted the python variable `capability` which is used in later lines. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45246 Reviewed By: walterddr Differential Revision: D23923916 Pulled By: malfet fbshipit-source-id: c5d7fef9e4a87ccc621191200e5965710e9d6aaa --- torch/cuda/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index e8687cad17e8..1176c6ee3060 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -100,6 +100,7 @@ def _check_cubins(): supported = any([sm // 10 == cap_major for sm in supported_sm]) if not supported: device_name = get_device_name(idx) + capability = cap_major * 10 + cap_minor warnings.warn(incompatible_device_warn.format(device_name, capability, " ".join(arch_list), device_name)) From 630bd85aae958495682fb5959f5a97832c2223d7 Mon Sep 17 00:00:00 2001 From: Jiakai Liu Date: Thu, 24 Sep 2020 20:15:31 -0700 Subject: [PATCH 004/292] [pytorch] refine dispatch keys in native_functions.yaml (2/N) (#45284) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45284 This is the 2nd batch of the change described in #45010. In this batch we relaxed some filters to cover more 'backend specific' ops: * ops that not call any 'Tensor::is_xxx()' method OR only call 'Tensor::is_cuda()' - we are adding CUDA dispatch key anyway; * ops that call other ATen ops but ARE differentiable - differentiability is a fuzzy indicator of not being 'composite'; Inherited other filters from the 1st batch: * These ops don't already have dispatch section in native_functions.yaml; * These ops call one or more DispatchStub (thus "backend specific"); Differential Revision: D23909901 Test Plan: Imported from OSS Reviewed By: ailzhang Pulled By: ljk53 fbshipit-source-id: 3b31e176324b6ac814acee0b0f80d18443bd81a1 --- aten/src/ATen/native/native_functions.yaml | 148 +++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index f5bbb263ed9c..0d5582572d6e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -226,6 +226,8 @@ variants: function, method - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: abs_out # Note [Adding an alias] # To add an alias do the following: @@ -268,6 +270,8 @@ variants: function, method - func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: angle_out - func: view_as_real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full @@ -285,6 +289,8 @@ variants: method - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sgn_out - func: real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full @@ -425,8 +431,12 @@ - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: all - func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: all_out - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -440,8 +450,12 @@ - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: any - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: any_out - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -688,9 +702,13 @@ - func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) variants: function + dispatch: + CPU, CUDA: bernoulli_out - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method @@ -900,6 +918,8 @@ variants: function, method - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_out - func: clamp_max(Tensor self, Scalar max) -> Tensor use_c10_dispatcher: full @@ -910,6 +930,8 @@ variants: function, method - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_max_out - func: clamp_min(Tensor self, Scalar min) -> Tensor use_c10_dispatcher: full @@ -920,6 +942,8 @@ variants: function, method - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_min_out # clip is an alias for clamp - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor @@ -1811,6 +1835,8 @@ - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor variants: function, method + dispatch: + CPU, CUDA: index # NB: This function is special-cased in tools/autograd/gen_variable_type.py # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: # - Tensor Tensor::index(ArrayRef indices) @@ -1843,6 +1869,8 @@ - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) variants: function + dispatch: + CPU, CUDA: _index_put_impl_ - func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? 
running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor use_c10_dispatcher: full @@ -2142,6 +2170,8 @@ - func: matrix_exp(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: matrix_exp - func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor use_c10_dispatcher: full @@ -2171,6 +2201,8 @@ variants: function, method - func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU, CUDA: max_out - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -2187,6 +2219,8 @@ variants: function, method - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: amax_out # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) @@ -2258,6 +2292,8 @@ variants: function, method - func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU, CUDA: min_out - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -2269,6 +2305,8 @@ variants: function, method - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: amin_out - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor use_c10_dispatcher: full @@ -2584,18 +2622,26 @@ - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_forward - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_backward - func: pdist(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full - func: _pdist_forward(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_forward - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_backward - func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor use_c10_dispatcher: full @@ -2899,10 +2945,14 @@ - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink_backward - func: rsqrt(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3191,27 +3241,39 @@ - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? 
dtype=None) -> Tensor variants: function, method - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sum_out - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) - func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: nansum - func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: nansum - func: nansum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nansum_out - func: sum_to_size(Tensor self, int[] size) -> Tensor use_c10_dispatcher: full @@ -3241,23 +3303,33 @@ - func: std(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function - func: std.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: std_out - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -3267,12 +3339,18 @@ - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: prod_out - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method @@ -3428,6 +3506,8 @@ variants: function, method - func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: trunc_out # Alias for trunc - func: fix(Tensor self) -> Tensor @@ -3506,12 +3586,18 @@ - func: var(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: var_out - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -3521,10 +3607,14 @@ - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function @@ -3560,6 +3650,8 @@ - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _s_where - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor use_c10_dispatcher: full @@ -3720,8 +3812,12 @@ variants: function, method - func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor variants: function, method @@ -3830,6 +3926,8 @@ - func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: rsub - func: heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -4279,6 +4377,8 @@ - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full @@ -4287,6 +4387,8 @@ - func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_tensor_affine - func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full @@ -4295,6 +4397,8 @@ - func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_channel_affine - func: fake_quantize_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full @@ -4303,6 +4407,8 @@ - func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_channel_affine - func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full @@ -4999,6 +5105,8 @@ - func: 
uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: uniform_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method @@ -5037,10 +5145,14 @@ device_guard: False - func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cross_out - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: cross - func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5711,6 +5823,8 @@ - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: digamma - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5782,6 +5896,8 @@ - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: atan2 - func: lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5906,8 +6022,12 @@ - func: maximum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: maximum - func: maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: maximum_out # binary max, alias of maximum # NOTE: max is not an alias for maximum, since there is also unary max @@ -5920,8 +6040,12 @@ - func: minimum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: minimum - func: minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: minimum_out # binary min, alias for minimum # NOTE: min is not an alias for minimum, since there is also unary min @@ -6002,6 +6126,8 @@ - func: all(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: all - func: any(Tensor self) -> Tensor use_c10_dispatcher: full @@ -6077,18 +6203,32 @@ - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: normal_ - func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -6396,10 +6536,14 @@ - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn + dispatch: + CPU, CUDA: mse_loss_out - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: mse_loss - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6562,6 +6706,8 @@ - func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: smooth_l1_loss - func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7603,6 +7749,8 @@ - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: sigmoid_backward - func: logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn From c6500bcf1494aadf7bd86adb554fdad376b7f105 Mon Sep 17 00:00:00 2001 From: Yanli Zhao Date: Thu, 24 Sep 2020 20:52:17 -0700 Subject: [PATCH 005/292] [reland] Make grad point to bucket buffer in DDP to save memory usage (#44344) Summary: [test all] Pull Request resolved: https://github.com/pytorch/pytorch/pull/44344 reland #41954 Add one argument in DDP API to enable/disable letting grads pointing to views. When it is disabled, behavior is the same as DDP right now; when it is enabled, Make both variable.grad() and grad in distautograd context point to bucket buffer in DDP to save memory usage. In this case, grad will be view of bucket buffer tensors, in order to make it compatiable with optimizer.zero_grad(), we made changes in #41283. Also be noted that we can not make variable.grad() pointing to bucket buffer during construction time, because we want to keep grad undefined for unused parameters. ghstack-source-id: 112845787 Test Plan: 1. When grad_is_view=false: a. roberta_base, peak memory usage 8250MB, p50 per iteration latency 0.923second, https://www.internalfb.com/intern/fblearner/details/218029699/?notif_channel=cli b. resnet, peak memory usage 3089MB, p50 per iteration latency 0.120second, https://www.internalfb.com/intern/fblearner/details/218029035/?notif_channel=cli c. accuracy benchmark, distributed=false, .accuracy 40.914535522461, .loss: 1.6370717287064; distributed=true, .accuracy: 39.966053009033, .loss: 1.6849111318588 https://www.internalfb.com/intern/fblearner/details/218035688/?notif_channel=cli d. classy vision uru production flow, https://www.internalfb.com/intern/fblearner/details/219065811/?notif_channel=cli e. pytext flow, https://www.internalfb.com/intern/fblearner/details/219137458/?notif_channel=cli 2. When grad_is_view=true: a. roberta_base, peak memory usage 7183MB, p50 per iteration latency 0.908second, https://www.internalfb.com/intern/fblearner/details/217882539?tab=operator_details b. resnet, peak memory usage 2988 MB, p50 per iteration latency 0.119second, https://www.internalfb.com/intern/fblearner/details/218028479/?notif_channel=cli c. accuracy benchmark, distributed=false, .accuracy 41.713260650635, .loss: 1.69939661026; distributed=true, .accuracy: 39.966053009033, .loss: 1.6849111318588, https://www.internalfb.com/intern/fblearner/details/218037058/?notif_channel=cli d. classy vision uru production flow, expected, can not work well with apex.amp https://www.internalfb.com/intern/fblearner/details/219205218/?notif_channel=cli e. 
pytext flow, detach_() related error, expected, as pytext zero_grad depends on apex repo where detach_() is called. also seeing the warning in finalize_bucket_dense due to tied weights, which is expected. https://www.internalfb.com/intern/fblearner/details/219150229/?notif_channel=cli Reviewed By: mrshenli Differential Revision: D23588186 fbshipit-source-id: f724d325b954ef6f06ede31759bf01dd29a6f5e5 --- test/distributed/test_c10d.py | 180 +++++++++---- torch/csrc/autograd/VariableTypeManual.cpp | 7 +- .../csrc/autograd/functions/accumulate_grad.h | 5 + torch/csrc/distributed/c10d/init.cpp | 2 + torch/csrc/distributed/c10d/reducer.cpp | 239 +++++++++++++----- torch/csrc/distributed/c10d/reducer.h | 17 +- torch/nn/parallel/distributed.py | 29 ++- .../_internal/distributed/distributed_test.py | 66 ++++- 8 files changed, 425 insertions(+), 120 deletions(-) diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 64e255fce3e6..a81bc53f175a 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -1974,13 +1974,15 @@ def tearDown(self): def world_size(self): return 2 - def _prepare_single_device_module(self, process_group, devices, device_ids, global_batch_size): + def _prepare_single_device_module( + self, process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view=False): model = Net() ddp_model = DistributedDataParallel( copy.deepcopy(model).to(devices[0]), device_ids=device_ids, process_group=process_group, - bucket_cap_mb=0.001) + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view) model.to(devices[0]) @@ -1989,7 +1991,7 @@ def _prepare_single_device_module(self, process_group, devices, device_ids, glob return model, ddp_model, input, target - def _prepare_multi_device_module(self, process_group, devices, device_ids, global_batch_size): + def _prepare_multi_device_module(self, process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view=False): self.assertTrue( len(devices) == 2 or len(devices) == 4, "unexpected devices for ddp tests {}".format(devices)) @@ -2002,14 +2004,15 @@ def _prepare_multi_device_module(self, process_group, devices, device_ids, globa copy.deepcopy(model), device_ids=device_ids, process_group=process_group, - bucket_cap_mb=0.001) + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view) input = torch.randn(global_batch_size, 2).cuda(devices[0]) target = torch.randn(global_batch_size, 4) return model, ddp_model, input, target - def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi_device=False): + def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): """ Note: we pass down `device_ids` all the way to DistributedDataParallel as part of the test. 
Below you find tests that either use a list of @@ -2023,11 +2026,11 @@ def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi if multi_device: model, ddp_model, input, target = \ self._prepare_multi_device_module( - process_group, devices, device_ids, global_batch_size) + process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view) else: model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, devices, device_ids, global_batch_size) + process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view) def step_model(model, input, target): model.train() @@ -2062,17 +2065,21 @@ def update_parameters(model): torch.manual_seed(1337 + iteration) input = input[torch.randperm(global_batch_size)] - def _test_gloo_backend(self, devices, device_ids, multi_device=False): + def _test_gloo_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) - self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @requires_gloo() def test_gloo_backend_cpu_module(self): self._test_gloo_backend([torch.device("cpu")], []) + @requires_gloo() + def test_gloo_backend_cpu_module_grad_is_view(self): + self._test_gloo_backend([torch.device("cpu")], [], gradient_as_bucket_view=True) + @requires_gloo() @skip_if_not_multigpu def test_gloo_backend_1gpu_module_device_ids_integer_list(self): @@ -2101,10 +2108,10 @@ def test_gloo_backend_4gpu_module(self): devices = [torch.device("cuda:" + str(i)) for i in int_devices] self._test_gloo_backend(devices, [], multi_device=True) - def _test_nccl_backend(self, devices, device_ids, multi_device=False): + def _test_nccl_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @requires_nccl() @skip_if_not_multigpu @@ -2169,10 +2176,7 @@ def test_ddp_multi_device_module_config(self): ddp_model = DistributedDataParallel( model, device_ids=gpus, process_group=process_group) - @requires_nccl() - @skip_if_not_multigpu - @skip_if_rocm - def test_fp16(self): + def _test_fp16(self, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -2184,6 +2188,7 @@ def test_fp16(self): device_ids=[gpus[0]], process_group=process_group, bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view ) # Input 2**15, so that the gradients will overflow with a @@ -2204,7 +2209,16 @@ def test_fp16(self): @requires_nccl() @skip_if_not_multigpu @skip_if_rocm - def test_arbitrary_forward_return_value(self): + def test_fp16(self): + self._test_fp16() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_fp16_grad_is_view(self): + self._test_fp16(gradient_as_bucket_view=True) + + def _test_arbitrary_forward_return_value(self, 
gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2240,6 +2254,7 @@ def forward(self, x, fn): ForwardReturnValueModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) batch_size = 4 @@ -2295,7 +2310,16 @@ def test(box, unbox): @requires_nccl() @skip_if_not_multigpu @skip_if_rocm - def test_find_unused_parameters_kwarg(self): + def test_arbitrary_forward_return_value(self): + self._test_arbitrary_forward_return_value() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_arbitrary_forward_return_value_grad_is_view(self): + self._test_arbitrary_forward_return_value(gradient_as_bucket_view=True) + + def _test_find_unused_parameters_kwarg(self, gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2325,12 +2349,13 @@ def forward(self, x): input = torch.rand([batch_size, 2], dtype=torch.float) target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(device_id) - def test_find_unused_parameters(find_unused_parameters, test_default=False): + def test_find_unused_parameters(find_unused_parameters, test_default=False, gradient_as_bucket_view=False): if test_default: model = DistributedDataParallel( FindUnusedParametersModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) else: model = DistributedDataParallel( @@ -2338,6 +2363,7 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): device_ids=[device_id], process_group=process_group, find_unused_parameters=find_unused_parameters, + gradient_as_bucket_view=gradient_as_bucket_view, ) output, fc3 = model(input) @@ -2349,7 +2375,7 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): # trigger an error when `backward` is called (because fc3 is an unused # parameter and will therefore be marked ready twice). try: - test_find_unused_parameters(True) + test_find_unused_parameters(True, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.assertTrue( str(ex).startswith("Expected to mark a variable ready only once.")) @@ -2359,19 +2385,29 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): # Then test that the default behavior can be overridden by setting # `find_unused_parameters=False`. 
try: - test_find_unused_parameters(False) + test_find_unused_parameters(False, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.fail("Unexpected exception: %s" % ex) # Test find_unused_parameters defaults to False try: - test_find_unused_parameters(True, test_default=True) + test_find_unused_parameters(True, test_default=True, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.fail("Unexpected exception: %s" % ex) - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_global_local_unused_params_grad(self): + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_find_unused_parameters_kwarg(self): + self._test_find_unused_parameters_kwarg() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_find_unused_parameters_kwarg_grad_is_view(self): + self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True) + + def _test_global_local_unused_params_grad(self, gradient_as_bucket_view=False): """ By simulating a multi-task training, this test is to make sure: 1) DDP does not touch the grad of globally unused parameters. @@ -2417,6 +2453,7 @@ def run_and_verify_grad(model): GlobalLocalUnusedParamModule().cpu(), process_group=process_group, find_unused_parameters=True, + gradient_as_bucket_view=gradient_as_bucket_view, ) run_and_verify_grad(cpu_model) @@ -2427,9 +2464,20 @@ def run_and_verify_grad(model): device_ids=[device_id], process_group=process_group, find_unused_parameters=True, + gradient_as_bucket_view=gradient_as_bucket_view, ) run_and_verify_grad(gpu_model) + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_global_local_unused_params_grad(self): + self._test_global_local_unused_params_grad() + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_global_local_unused_params_grad_with_grad_is_view(self): + self._test_global_local_unused_params_grad(gradient_as_bucket_view=True) + @requires_gloo() @skip_if_lt_x_gpu(2) def test_find_unused_parameters_when_unused_parameters_empty(self): @@ -2486,10 +2534,7 @@ def run_and_verify_grad(model): ) run_and_verify_grad(gpu_model) - @requires_nccl() - @skip_if_not_multigpu - @skip_if_rocm - def test_multiple_outputs_multiple_backward(self): + def _test_multiple_outputs_multiple_backward(self, gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2523,6 +2568,7 @@ def forward(self, x): MultipleOutputModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) batch_size = 4 @@ -2537,6 +2583,18 @@ def forward(self, x): loss2 = criterion(output2, target) loss2.backward() + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_multiple_outputs_multiple_backward(self): + self._test_multiple_outputs_multiple_backward() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_multiple_outputs_multiple_backward_grad_is_view(self): + self._test_multiple_outputs_multiple_backward(gradient_as_bucket_view=True) + @requires_nccl() @skip_if_not_multigpu @skip_if_rocm @@ -2586,7 +2644,7 @@ def check_no_grads(): # No parameter should have their gradient set. check_no_grads() - def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None): + def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None, gradient_as_bucket_view=False): """ This is the recommended way to implement accumulate grads. 
If ``ddp_comm_hook`` input was specified, it will also register that hook @@ -2601,7 +2659,7 @@ def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None): local_batch_size = len(devices) model, ddp_model, input, target = self._prepare_single_device_module( - process_group, devices, devices, global_batch_size + process_group, devices, devices, global_batch_size, gradient_as_bucket_view ) if ddp_comm_hook is not None: @@ -2658,6 +2716,15 @@ def test_accumulate_gradients_no_sync(self): """ self._test_accumulate_gradients_no_sync() + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_no_sync_grad_is_view(self): + """ + Runs _test_accumulate_gradients_no_sync using default inputs + """ + self._test_accumulate_gradients_no_sync(gradient_as_bucket_view=True) + @requires_nccl() @skip_if_not_multigpu @skip_if_rocm @@ -2708,10 +2775,7 @@ def div(fut): num_iters=4, ddp_comm_hook=allreduce_with_then_hook ) - @requires_nccl() - @skip_if_not_multigpu - @skip_if_rocm - def test_accumulate_gradients_module(self): + def _test_accumulate_gradients_module(self, gradient_as_bucket_view=False): # This is NOT the recommended way to implement accumulating grads, but # we would like to make sure DDP does not mess up with the underlying # module. @@ -2723,7 +2787,7 @@ def test_accumulate_gradients_module(self): model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, devices, devices, global_batch_size) + process_group, devices, devices, global_batch_size, gradient_as_bucket_view) def step_model(model, input, target): model.train() @@ -2763,6 +2827,18 @@ def step_model(model, input, target): torch.manual_seed(1337 + iteration) input = input[torch.randperm(global_batch_size)] + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_module(self): + self._test_accumulate_gradients_module() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_module_with_grad_is_view(self): + self._test_accumulate_gradients_module(gradient_as_bucket_view=True) + @requires_gloo() def test_ignored_output(self): """ @@ -3022,8 +3098,7 @@ def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): ddp_parameter = next(ddp_model.parameters()) self.assertEqual(vanilla_parameter.grad, ddp_parameter.grad) - @requires_gloo() - def test_sparse_gradients(self): + def _test_sparse_gradients(self, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) @@ -3034,10 +3109,19 @@ def test_sparse_gradients(self): ddp_model = DistributedDataParallel( copy.deepcopy(vanilla_model), process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) self._run_and_verify_sparse_gradients(vanilla_model, ddp_model) + @requires_gloo() + def test_sparse_gradients(self): + self._test_sparse_gradients() + + @requires_gloo() + def test_sparse_gradients_grad_is_view(self): + self._test_sparse_gradients(gradient_as_bucket_view=True) + def _test_grad_layout(self, replica_devices, layer_devs, local_batch_size): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -3206,12 +3290,13 @@ def test_ddp_comm_hook_future_passing_cpu(self): # without the comm_hook, result would be 0.25 * torch.ones(2, 2). 
self._run_and_verify_hook(cpu_model, 8, 2 * torch.ones(2, 2)) - def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None): + def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None, gradient_as_bucket_view=False): device_id = gpus_for_rank(self.world_size)[self.rank][0] gpu_model = DistributedDataParallel( ModuleForDdpCommHook().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) # Register DDP Communication Hook if defined @@ -3276,10 +3361,7 @@ def test_ddp_comm_hook_future_passing_gpu_nccl(self): # without the comm_hook, result would be 0.25 * torch.ones(2, 2). self._run_and_verify_hook(gpu_model, 8, 2 * torch.ones(2, 2)) - @requires_nccl() - @skip_if_lt_x_gpu(2) - @skip_if_rocm - def test_ddp_comm_hook_allreduce_hook_nccl(self): + def _test_ddp_comm_hook_allreduce_hook_nccl(self, gradient_as_bucket_view=False): """ This unit test verifies whether a DDP communication hook that just calls allreduce gives the same result result with the case of no hook registered. @@ -3294,11 +3376,23 @@ def allreduce_hook(state: object, bucket: dist._GradBucket) -> torch._C.Future: return process_group.allreduce(tensors).get_future() # Get GPU model with allreduce_hook registered. - gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, allreduce_hook) + gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, allreduce_hook, gradient_as_bucket_view) # check whether the grads are equal to what DDP without hook would return. self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_ddp_comm_hook_allreduce_hook_nccl(self): + self._test_ddp_comm_hook_allreduce_hook_nccl() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_ddp_comm_hook_allreduce_hook_nccl_grad_is_view(self): + self._test_ddp_comm_hook_allreduce_hook_nccl(gradient_as_bucket_view=True) + @requires_nccl() @skip_if_lt_x_gpu(2) @skip_if_rocm diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index c72c67eb5230..18e5e4f54820 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -269,7 +269,12 @@ Tensor & detach_(Tensor & self) { "of detach_(). Alternatively, create this view with an " "`unsafe_` version of the function that produced it."); } else { - AT_ERROR("Can't detach views in-place. Use detach() instead"); + AT_ERROR("If you are using DistributedDataParallel (DDP) for training, " + "and gradient_as_bucket_view is set as True, gradients are " + "views of DDP buckets, and hence detach_() cannot be called " + "on these gradients. To fix this error, please refer to the " + "Optimizer.zero_grad() function in torch/optim/optimizer.py " + "as the solution."); } } // I think the choice here is conservative. In principle, doing diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index e1a02dc19fd8..dafd07f64b84 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -161,6 +161,11 @@ struct TORCH_API AccumulateGrad : public Node { // valid operation which adds `new_grad` to `variable_grad` in // place. `variable_grad` is thus still referring to the same tensor // after the operation. + // Also DistributedDataParallel(DDP) package relies on grad being + // mutated in place for saving peak memory usage. 
DDP will still + // work correctly if it is mutated out of place here, but DDP will + // maintain one extra copy of grad tensors in buffer and thus + // increase peak memory usage. variable_grad += new_grad; CHECK_RESULT(variable_grad, variable); // ^ We could enforce the contract more aggressively here by writing: diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index aff2da31c133..165d6a1c8603 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -159,6 +159,7 @@ PyObject* c10d_init(PyObject* _unused) { std::shared_ptr<::c10d::ProcessGroup>, std::vector>, int64_t, + bool, bool>(), py::arg("replicas"), py::arg("bucket_indices"), @@ -166,6 +167,7 @@ PyObject* c10d_init(PyObject* _unused) { py::arg("expect_sparse_gradients") = std::vector>(), py::arg("bucket_bytes_cap") = ::c10d::kDefaultBucketBytesCap, py::arg("find_unused_parameters") = false, + py::arg("gradient_as_bucket_view") = false, py::call_guard()) .def( "initialize_buckets", diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 1a5766eea84e..86916c7994dd 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -32,7 +32,8 @@ Reducer::Reducer( std::shared_ptr process_group, std::vector> expect_sparse_gradients, int64_t bucket_bytes_cap, - bool find_unused_parameters) + bool find_unused_parameters, + bool gradient_as_bucket_view) : replicas_(std::move(replicas)), process_group_(std::move(process_group)), expect_sparse_gradients_(std::move(expect_sparse_gradients)), @@ -41,6 +42,7 @@ Reducer::Reducer( next_bucket_(0), has_marked_unused_parameters_(false), find_unused_parameters_(find_unused_parameters), + gradient_as_bucket_view_(gradient_as_bucket_view), local_used_maps_reduced_(false), backward_stats_base_(0), has_rebuilt_bucket_(false), @@ -310,6 +312,56 @@ void Reducer::verify_replica0_across_processes() { } } +void Reducer::check_grad_layout( + const at::Tensor& grad, + const at::Tensor& bucket_view) { + // Ensure that the gradient type matches the bucket type. + TORCH_CHECK( + grad.options().type_equal(bucket_view.options()), + "Expected ", + bucket_view.toString(), + ", got ", + grad.toString()); + TORCH_INTERNAL_ASSERT(grad.device() == bucket_view.device()); + TORCH_INTERNAL_ASSERT(grad.numel() == bucket_view.numel()); + // AccumulateGrad doesn't HAVE to obey the grad layout contract. + // The penalty for disobedience is reduced performance, not numerical + // death. Warnings here help diagnose poor DDP performance. + if (grad.strides() != bucket_view.strides()) { + TORCH_WARN_ONCE( + "Grad strides do not match bucket view strides. " + "This may indicate grad was not created according to the " + "gradient layout contract, or that the param's strides " + "changed since DDP was constructed. This is not an error, " + "but may impair performance.\n" + "grad.sizes() = ", + grad.sizes(), + ", strides() = ", + grad.strides(), + "\n", + "bucket_view.sizes() = ", + bucket_view.sizes(), + ", strides() = ", + bucket_view.strides()); + } + if (!gradient_as_bucket_view_) { + TORCH_INTERNAL_ASSERT(!grad.is_alias_of(bucket_view)); + } +} + +void Reducer::copy_grad_to_bucket(at::Tensor& grad, at::Tensor& bucket_view) { + // See Note [DDP Communication Hook] + if (comm_hook_ == nullptr) { + // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp + auto wrapped = c10::scalar_to_tensor(double(1.) 
/ divFactor_); + wrapped.unsafeGetTensorImpl()->set_wrapped_number(true); + // Divides while copying into the bucket view. + at::native::mul_out(bucket_view, grad, wrapped); + } else { + bucket_view.copy_(grad); + } +} + void Reducer::mark_variable_ready_dense(VariableIndex index) { const auto replica_index = index.replica_index; const auto variable_index = index.variable_index; @@ -327,49 +379,27 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) { // of the bucket it would otherwise hold. runGradCallbackForVariable(variable, [&](auto& grad) { if (grad.defined()) { - // Ensure that the gradient type matches the bucket type. - TORCH_CHECK( - grad.options().type_equal(bucket_view.options()), - "Expected ", - bucket_view.toString(), - ", got ", - grad.toString()); - // Assert that the grad tensor and the bucket don't share storage. - // If they did, we could avoid the copy altogether. - // The reason for not doing this is that existing code calls - // `detach_` from `zero_grad`, which is incompatible with views. - TORCH_INTERNAL_ASSERT(!grad.is_alias_of(bucket_view)); - TORCH_INTERNAL_ASSERT(grad.device() == bucket_view.device()); - TORCH_INTERNAL_ASSERT(grad.numel() == bucket_view.numel()); - // AccumulateGrad doesn't HAVE to obey the grad layout contract. - // The penalty for disobedience is reduced performance, not numerical - // death. Warnings here help diagnose poor DDP performance. - if (grad.strides() != bucket_view.strides()) { - TORCH_WARN_ONCE( - "Grad strides do not match bucket view strides. " - "This may indicate grad was not created according to the " - "gradient layout contract, or that the param's strides " - "changed since DDP was constructed. This is not an error, " - "but may impair performance.\n" - "grad.sizes() = ", - grad.sizes(), - ", strides() = ", - grad.strides(), - "\n", - "bucket_view.sizes() = ", - bucket_view.sizes(), - ", strides() = ", - bucket_view.strides()); - } - // See Note [DDP Communication Hook] - if (comm_hook_ == nullptr) { - // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp - auto wrapped = c10::scalar_to_tensor(double(1.) / divFactor_); - wrapped.unsafeGetTensorImpl()->set_wrapped_number(true); - // Divides while copying into the bucket view. - at::native::mul_out(bucket_view, grad, wrapped); + this->check_grad_layout(grad, bucket_view); + // When gradient_as_bucket_view_ is false, or even when + // gradient_as_bucket_view_ is true, in rare cases users may set grad to + // be None after every iteration. In these cases, grad and bucket_view are + // pointing to different storages and thus need to copy grads to + // bucket_view. If gradient_as_bucket_view_ is set as true, let grad point + // to bucket_view. If grad has already been set as views of buckets in + // previous iterations, no copy is needed. + if (!grad.is_alias_of(bucket_view)) { + this->copy_grad_to_bucket(grad, bucket_view); + if (gradient_as_bucket_view_) { + // Let grad point to bucket_view buffer. + grad = bucket_view; + // The grad is modified and need to be written back. 
+ return true; + } } else { - bucket_view.copy_(grad); + // If grad and bucket view point to the same storage, no need to copy + if (comm_hook_ == nullptr) { + bucket_view.div_(divFactor_); + } } } else { bucket_view.zero_(); @@ -674,6 +704,17 @@ void Reducer::mark_bucket_ready(size_t bucket_index) { void Reducer::initialize_buckets( std::vector> bucket_indices) { + // If initialize_buckets is called inside DDP constructor, then + // it does not matter rpc context ptr is nullptr or not, as grad + // will not be mutated. + // If initialize_buckets is called during training loop, e.g, inside + // rebuild_buckets(), since grad could be mutated and be pointed to + // bucket_view, then it needs to check rpc context ptr is nullptr or not, + // If rpc context ptr is nullptr, mutate variable.grad(); otherwise, + // mutate grad in rpc context. + using torch::distributed::autograd::ThreadLocalDistAutogradContext; + this->rpc_context_.set(ThreadLocalDistAutogradContext::getContextPtr()); + // This shouldn't be called if we're expecting autograd hooks to fire. TORCH_CHECK( !expect_autograd_hooks_, @@ -825,7 +866,7 @@ void Reducer::initialize_bucket_views( Reducer::BucketReplica& replica, at::Tensor& contents) { for (size_t i = 0; i < replica.variables.size(); i++) { - const auto& v = replica.variables[i]; + auto& v = replica.variables[i]; const auto offset = replica.offsets[i]; const auto length = replica.lengths[i]; if (v.is_non_overlapping_and_dense()) { @@ -844,6 +885,29 @@ void Reducer::initialize_bucket_views( // By default `bucket_views_out` and `bucket_views_in` are // essentially the same thing. replica.bucket_views_out = replica.bucket_views_in; + + // If gradient_as_bucket_view_ is set as true, then there are two cases to + // handle: initialize_bucket_views could be called inside initialize_buckets + // when rebuild_buckets, if grad has already been defined/calculated in + // previous iteration, old grad needs to be copied into new bucket_view and + // let grad point to the new bucket_view, initialize_bucket_views could also + // be called inside initialize_buckets during construction. Grads are not + // defined during construction time, in this case, do not let grad point to + // bucket_view, because grads should be kept as being undefined for globally + // unused parameters. + if (gradient_as_bucket_view_) { + auto& bucket_view = replica.bucket_views_in.back(); + runGradCallbackForVariable(v, [&](auto& grad) { + if (grad.defined() && !grad.is_alias_of(bucket_view)) { + bucket_view.copy_(grad); + grad = bucket_view; + // The grad is modefied and needs to be written back. + return true; + } + // The grad is not modified and does not need to be written back. + return false; + }); + } } } @@ -965,6 +1029,31 @@ void Reducer::prepare_for_backward( } } +void Reducer::copy_bucket_to_grad( + torch::autograd::Variable& variable, + Reducer::BucketReplica& replica, + size_t intra_bucket_index, + bool global_unused) { + const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; + runGradCallbackForVariable(variable, [&](auto& grad) { + // If a parameter is globally unused, we keep its grad untouched. + if (!global_unused) { + if (!grad.defined()) { + // Creates grad according to the "Gradient Layout Contract" + // (see torch/csrc/grad/AccumulateGrad.h) + grad = + torch::autograd::utils::clone_obey_contract(bucket_view, variable); + } else { + grad.copy_(bucket_view); + } + // The grad is modified and needs to be written back. + return true; + } + // The grad is not modified. 
+ return false; + }); +} + // A bucket with one or more dense tensors needs to be unflattened. void Reducer::finalize_bucket_dense(Bucket& bucket) { for (size_t replica_index = 0; replica_index < bucket.replicas.size(); @@ -1015,24 +1104,52 @@ void Reducer::finalize_bucket_dense(Bucket& bucket) { } } - const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; - runGradCallbackForVariable(variable, [&](auto& grad) { - // If a parameter is globally unused, we keep its grad untouched. - if (!global_unused) { - if (!grad.defined()) { - // Creates grad according to the "Gradient Layout Contract" - // (see torch/csrc/grad/AccumulateGrad.h) - grad = torch::autograd::utils::clone_obey_contract( - bucket_view, variable); - } else { - grad.copy_(bucket_view); - } - // The grad is modified and needs to be written back. - return true; + if (!gradient_as_bucket_view_) { + copy_bucket_to_grad( + variable, replica, intra_bucket_index, global_unused); + } else { + const auto& bucket_view_out = + replica.bucket_views_out[intra_bucket_index]; + auto& bucket_view_in = replica.bucket_views_in[intra_bucket_index]; + // If a communication hook is registered, bucket_view_out stores the + // allreduced results in a newly allocated tensor, so copy bucket_view_out + // back to bucket_view_in, which refers to the replica.content tensor and + // grad. + if (!bucket_view_in.is_alias_of(bucket_view_out)) { + bucket_view_in.copy_(bucket_view_out); } - // The grad is not modified. - return false; - }); + runGradCallbackForVariable(variable, [&](auto& grad) { + // If a parameter is globally unused, we keep its grad untouched. + if (!global_unused) { + // If grad is globally used but locally unused, let grad point to + // bucket_view_in + if (!grad.defined()) { + grad = bucket_view_in; + } else { + if (!grad.is_alias_of(bucket_view_in)) { + grad.copy_(bucket_view_in); + TORCH_WARN_ONCE( + "Detected at least one parameter gradient is not the " + "expected DDP bucket view when setting " + "gradient_as_bucket_view=True. This can happen when " + "multiple parameters share the same gradient. For " + "example, param0 and param1 share the same gradient " + "grad0. In this case, grad0 would first point to " + "bucket_view_in0 when param0 is ready. Later, when " + "param1 is ready, it will override grad0 to point to " + "bucket_view_in1. However, param0 still expects grad0 " + "to point to bucket_view_in0, and hence hits this " + "warning. If you see this message, please double-check that " + "the above situation is expected for your application."); + } + } + // The grad is modified and needs to be written back. + return true; + } + // The grad is not modified. + return false; + }); + } } } } diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 3b441c99a3b6..960a32356acf 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -30,7 +30,8 @@ class Reducer { std::shared_ptr process_group, std::vector> expect_sparse_gradients, int64_t bucket_bytes_cap, - bool find_unused_parameters); + bool find_unused_parameters, + bool gradient_as_bucket_view); ~Reducer() noexcept(false); @@ -124,6 +125,7 @@ class Reducer { bool has_marked_unused_parameters_; const bool find_unused_parameters_; + const bool gradient_as_bucket_view_; std::vector unused_parameters_; // Locally used parameter maps indicating if parameters are used locally // during the current iteration or no_sync session if no_sync is on.
One @@ -230,6 +232,19 @@ class Reducer { // with the result of `future_work`. void populate_bucket_views_out(BucketReplica& replica, at::Tensor& tensor); + // If gradient_as_bucket_view_ is false, after allreducing the buckets, + // copy the bucket results back to grads. + void copy_bucket_to_grad( + torch::autograd::Variable& variable, + Reducer::BucketReplica& replica, + size_t intra_bucket_index, + bool global_unused); + // Check the layout of grad and bucket_view before calling copy_grad_to_bucket. + void check_grad_layout(const at::Tensor& grad, const at::Tensor& bucket_view); + // If gradient_as_bucket_view_ is false, before allreducing the buckets, + // copy grads to buckets. + void copy_grad_to_bucket(at::Tensor& grad, at::Tensor& bucket_view); + // A bucket holds N bucket replicas (1 per model replica). // // If every bucket in this struct is ready, the reduction can be kicked off. diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 790a9d1c2fc4..5ec2b0148a21 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -316,6 +316,28 @@ class DistributedDataParallel(Module): are getting different gradients, which should not happen if DistributedDataParallel is correctly used. (default: ``False``) + gradient_as_bucket_view (bool): this is a prototype feature. When set to ``True``, + gradients will be views pointing to different offsets of the + allreduce communication buckets. This can reduce peak memory + usage; the memory saved is equal to the total + gradient size. Moreover, it avoids the overhead of copying + between gradients and allreduce communication buckets. + When gradients are views, ``detach_()`` cannot be called on the + gradients. If such errors are hit, refer to + the :meth:`~torch.optim.Optimizer.zero_grad` function in + ``torch/optim/optimizer.py`` for a fix. + Warning: ``gradient_as_bucket_view = True`` + does not work as expected when ``apex.amp`` is used for + mixed precision training. ``apex.amp`` maintains stashed gradients + that are used for unscaling gradients. These stashed gradients + point to the gradients (which are the communication buckets when + ``gradient_as_bucket_view = True``) before a new iteration starts. + In the new iteration, the communication buckets are mutated, so + these stashed gradients are unexpectedly mutated as well, + which may result in wrong + results. To fix this, the stashed gradients should not point + to the gradients; instead, they should be copied from the gradients when + ``gradient_as_bucket_view = True``.
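# A minimal sketch of enabling the flag documented above in a single-process
# Gloo setup (illustration only, not part of this patch); the model and the
# rendezvous settings below are placeholder assumptions.
import os
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

model = nn.Linear(8, 8)
# Gradients become views into the allreduce communication buckets, so avoid
# calling detach_() on them (see the zero_grad() note above).
ddp_model = DDP(model, gradient_as_bucket_view=True)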
Attributes: module (Module): the module to be parallelized @@ -330,7 +352,8 @@ def __init__(self, module, device_ids=None, process_group=None, bucket_cap_mb=25, find_unused_parameters=False, - check_reduction=False): + check_reduction=False, + gradient_as_bucket_view=False): super(DistributedDataParallel, self).__init__() @@ -381,6 +404,7 @@ def __init__(self, module, device_ids=None, self.require_backward_grad_sync = True self.require_forward_param_sync = True self.ddp_join_enabled = False + self.gradient_as_bucket_view = gradient_as_bucket_view if check_reduction: # This argument is no longer used since the reducer @@ -516,7 +540,8 @@ def produces_sparse_gradient(module): self.process_group, expect_sparse_gradient, self.bucket_bytes_cap, - self.find_unused_parameters) + self.find_unused_parameters, + self.gradient_as_bucket_view) # passing a handle to torch.nn.SyncBatchNorm layer self._passing_sync_batchnorm_handle(self._module_copies) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 85b1d65a06ec..f6f2b9a6fbfb 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -2096,6 +2096,14 @@ def _model_step(self, model): param += param.grad param.grad = None + def _model_step_with_zero_grad(self, model): + for param in model.parameters(): + if param.grad is not None: + with torch.no_grad(): + param += param.grad + param.grad.requires_grad_(False) + param.grad.zero_() + def _prepare_dummy_data(self, local_bs): # global_bs for DDP should be divisible by WORLD_SIZE world_size = int(os.environ["WORLD_SIZE"]) @@ -2118,7 +2126,8 @@ def _assert_equal_param(self, param_gpu, param_DDP): self.assertEqual(p_gpu, p_DDP) def _test_DDP_5iter( - self, model_base, model_DDP, input, target, loss, local_bs, rank, batch_size, test_save, offset=None, world_size=0 + self, model_base, model_DDP, input, target, loss, local_bs, rank, batch_size, test_save, + offset=None, world_size=0, zero_grad=False ): for idx in range(5): # single cpu/gpu training @@ -2137,8 +2146,12 @@ def _test_DDP_5iter( ) # Update weights and run a second iteration to shake out errors - self._model_step(model_base) - self._model_step(model_DDP) + if zero_grad: + self._model_step_with_zero_grad(model_base) + self._model_step_with_zero_grad(model_DDP) + else: + self._model_step(model_base) + self._model_step(model_DDP) self._assert_equal_param( list(model_base.parameters()), list(model_DDP.module.parameters()) ) @@ -2159,7 +2172,7 @@ def _test_DDP_5iter( for k in model_DDP.state_dict(): self.assertEqual(model_DDP.state_dict()[k], saved_model.state_dict()[k]) - def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None): + def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None, gradient_as_bucket_view=False): # Run a simple end to end DDP model, use result of single node model # as baseline @@ -2174,7 +2187,7 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None): model_DDP = copy.deepcopy(model) model_DDP.cuda(gpu_subset[0]) model_DDP = nn.parallel.DistributedDataParallel( - model_DDP, device_ids=gpu_subset + model_DDP, device_ids=gpu_subset, gradient_as_bucket_view=gradient_as_bucket_view ) # test serializable/unserializable @@ -2196,14 +2209,11 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None): local_bs, rank, global_bs, - True + True, ) self._barrier() - @unittest.skipIf( - BACKEND == 
"nccl", "nccl does not support DDP on CPU models" - ) - def test_DistributedDataParallelCPU(self): + def _test_DistributedDataParallelCPU(self, gradient_as_bucket_view=False): # Run a simple end to end DDP-CPU model, use result of single node # model as baseline group, group_id, rank = self._init_global_test() @@ -2213,7 +2223,8 @@ def test_DistributedDataParallelCPU(self): # DDP-CPU training setup model_DDP = copy.deepcopy(model_base) - model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) + model_DDP = nn.parallel.DistributedDataParallel( + model_DDP, gradient_as_bucket_view=gradient_as_bucket_view) # dummy data initialization local_bs = 2 @@ -2221,10 +2232,22 @@ def test_DistributedDataParallelCPU(self): # check two model parameters over 5 iterations self._test_DDP_5iter( - model_base, model_DDP, input_cpu, target, loss, local_bs, rank, global_bs, False + model_base, model_DDP, input_cpu, target, loss, local_bs, rank, global_bs, False, zero_grad=True ) self._barrier() + @unittest.skipIf( + BACKEND == "nccl", "nccl does not support DDP on CPU models" + ) + def test_DistributedDataParallelCPU(self): + self._test_DistributedDataParallelCPU() + + @unittest.skipIf( + BACKEND == "nccl", "nccl does not support DDP on CPU models" + ) + def test_DistributedDataParallelCPU_grad_is_view(self): + self._test_DistributedDataParallelCPU(gradient_as_bucket_view=True) + @unittest.skipIf(BACKEND != 'nccl' and BACKEND != 'gloo', "Only Nccl & Gloo backend support DistributedDataParallel") def test_DistributedDataParallel_requires_grad(self): @@ -2288,6 +2311,25 @@ def test_DistributedDataParallel(self): gpus = list(map(lambda i: torch.device('cuda:' + str(i)), gpus)) self._test_DistributedDataParallel(gpu_subset=gpus, rank=rank, output_device=torch.device('cuda')) + @unittest.skipIf(BACKEND != 'nccl' and BACKEND != 'gloo', + "Only Nccl & Gloo backend support DistributedDataParallel") + @skip_if_no_gpu + @skip_if_rocm + def test_DistributedDataParallel_with_grad_is_view(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + gpus = list(rank_to_GPU[rank]) + self._test_DistributedDataParallel(gpu_subset=gpus, rank=rank, gradient_as_bucket_view=True) + + # test output_device + self._test_DistributedDataParallel( + gpu_subset=gpus, rank=rank, output_device=torch.device('cuda'), gradient_as_bucket_view=True) + + # test device_ids + gpus = list(map(lambda i: torch.device('cuda:' + str(i)), gpus)) + self._test_DistributedDataParallel( + gpu_subset=gpus, rank=rank, output_device=torch.device('cuda'), gradient_as_bucket_view=True) + def _test_DistributedDataParallel_SyncBatchNorm(self, gpu_subset, rank, local_bs, global_bs, offset, output_device=None): # Run a simple end to end DDP model, use result of single node model # as baseline From 0122299f9ba729aa0c9bd43764af53225e03672c Mon Sep 17 00:00:00 2001 From: gunandrose4u <52735340+gunandrose4u@users.noreply.github.com> Date: Thu, 24 Sep 2020 21:12:16 -0700 Subject: [PATCH 006/292] Enable distributed package on windows, Gloo backend supported only (#42897) Summary: Fixes https://github.com/pytorch/pytorch/issues/42095 For test case part will be committed to this PR later mrshenli, please help to review Pull Request resolved: https://github.com/pytorch/pytorch/pull/42897 Reviewed By: osalpekar Differential Revision: D23841786 Pulled By: mrshenli fbshipit-source-id: 334ba1ed73eff2f668857390fc32d1bc7f08e5f3 --- .../install_miniconda3.bat | 7 +++ CMakeLists.txt | 8 ++- caffe2/CMakeLists.txt | 49 +++++++++------ 
cmake/Dependencies.cmake | 5 +- test/cpp/dist_autograd/CMakeLists.txt | 2 +- test/distributed/test_c10d.py | 49 ++++++++++----- test/distributed/test_c10d_spawn.py | 8 ++- test/run_test.py | 11 ++-- tools/build_variables.bzl | 7 ++- torch/CMakeLists.txt | 33 +++++----- torch/csrc/Module.cpp | 4 +- torch/csrc/WindowsTorchApiMacro.h | 6 ++ torch/csrc/distributed/c10d/comm.h | 4 +- torch/csrc/distributed/c10d/init.cpp | 10 ++- torch/csrc/distributed/c10d/reducer.cpp | 22 +++---- torch/csrc/distributed/c10d/reducer.h | 14 +++++ torch/csrc/jit/python/pybind_utils.h | 8 +-- .../csrc/jit/python/python_sugared_value.cpp | 2 +- torch/csrc/jit/runtime/interpreter.cpp | 8 +-- torch/csrc/jit/serialization/pickler.cpp | 6 +- torch/csrc/jit/serialization/unpickler.cpp | 6 +- torch/csrc/utils/future.h | 2 +- torch/distributed/rendezvous.py | 14 ++++- torch/lib/c10d/CMakeLists.txt | 32 ++++++---- torch/lib/c10d/FileStore.cpp | 51 +++++++++++++++- torch/lib/c10d/GlooDeviceFactory.cpp | 33 ++++++---- torch/lib/c10d/ProcessGroupGloo.cpp | 61 ++++++++++++++++--- torch/lib/c10d/Utils.cpp | 3 +- torch/lib/c10d/Utils.hpp | 4 ++ torch/lib/c10d/test/CMakeLists.txt | 15 +++-- torch/lib/c10d/test/CUDATest.hpp | 10 ++- torch/lib/c10d/test/FileStoreTest.cpp | 8 +++ torch/lib/c10d/test/ProcessGroupGlooTest.cpp | 9 ++- torch/lib/c10d/test/TestUtils.hpp | 30 ++++++++- torch/testing/_internal/common_distributed.py | 17 +++++- torch/testing/_internal/common_utils.py | 4 ++ torch/testing/_internal/dist_utils.py | 3 +- .../ddp_under_dist_autograd_test.py | 16 ++--- .../_internal/distributed/distributed_test.py | 48 +++++++++++---- 39 files changed, 462 insertions(+), 167 deletions(-) diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat index a66ef4b651c5..cf7255ce3789 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat @@ -12,4 +12,11 @@ call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Minic if "%REBUILD%"=="" ( call conda install -y -q python=%PYTHON_VERSION% numpy cffi pyyaml boto3 call conda install -y -q -c conda-forge cmake + call conda install -y -q -c rdonnelly libuv ) + +:: Get installed libuv path +@echo off +set libuv_ROOT=%CONDA_PARENT_DIR%\Miniconda3\Library +@echo on +echo libuv_ROOT=%libuv_ROOT% diff --git a/CMakeLists.txt b/CMakeLists.txt index 826c187b602e..3d937e0e1655 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,7 +103,7 @@ endif() # For non-supported platforms, turn USE_DISTRIBUTED off by default. # It is not tested and likely won't work without additional changes. -if(NOT LINUX) +if(NOT LINUX AND NOT WIN32) set(USE_DISTRIBUTED OFF CACHE STRING "Use distributed") # On macOS, if USE_DISTRIBUTED is enabled (specified by the user), # then make Gloo build with the libuv transport. @@ -226,6 +226,12 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." ON) option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF) +# Since TensorPipe does not support Windows, set it to OFF when WIN32 detected +if(WIN32) + set(USE_TENSORPIPE OFF) + message(WARNING "TensorPipe cannot be used on Windows. Set it to OFF") +endif() + # Linux distributions do not want too many embedded sources, in that sense we # need to be able to build pytorch with an (almost) empty third_party # directory. 
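As a rough end-to-end illustration of what this patch enables (a sketch, not part of the patch itself), the snippet below initializes the Gloo backend through the ``file://`` rendezvous in a way that works on both Windows and POSIX; the temp-file path and the single-process world size are assumptions for illustration, and the ``file:///`` form with forward slashes mirrors the path handling added to the rendezvous and test utilities later in this patch.

import os
import sys
import tempfile
import torch.distributed as dist

init_file = os.path.join(tempfile.gettempdir(), "ddp_shared_init_file")
if sys.platform == "win32":
    # Windows needs the file:/// scheme and forward slashes in the path.
    init_method = "file:///" + init_file.replace(os.path.sep, "/")
else:
    init_method = "file://" + init_file
dist.init_process_group("gloo", init_method=init_method, rank=0, world_size=1)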
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 65f072b6f29d..219b28c69695 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -291,26 +291,29 @@ endif() if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) if(USE_DISTRIBUTED) - add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") - target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) - add_dependencies(process_group_agent torch c10d) # Define this target even if we're building without TensorPipe, to make life # easier to other targets that depend on this. However, in that case, by not # setting the USE_TENSORPIPE compile definition, this target will just end # up being empty. Downstream targets should also add a #ifdef guard. - add_library(tensorpipe_agent - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" - ) - target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) - add_dependencies(tensorpipe_agent torch c10d) - if(USE_TENSORPIPE) - target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) - target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) - add_dependencies(tensorpipe_agent tensorpipe) + if(NOT WIN32) + add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") + target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) + add_dependencies(process_group_agent torch c10d) + + add_library(tensorpipe_agent + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" + ) + target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) + add_dependencies(tensorpipe_agent torch c10d) + if(USE_TENSORPIPE) + target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) + target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) + add_dependencies(tensorpipe_agent tensorpipe) + endif() endif() endif() @@ -493,7 +496,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" ) endif() - if(USE_DISTRIBUTED) + if(USE_DISTRIBUTED AND NOT WIN32) append_filelist("libtorch_distributed_sources" TORCH_SRCS) endif() endif() @@ -837,7 +840,7 @@ endif() if(BUILD_TEST AND NOT USE_ROCM) add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) add_subdirectory(${TORCH_ROOT}/test/cpp/tensorexpr ${CMAKE_BINARY_DIR}/test_tensorexpr) - if(USE_DISTRIBUTED) + if(USE_DISTRIBUTED AND NOT WIN32) add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc) endif() endif() @@ -889,9 +892,7 @@ endif() DESTINATION share/cmake/Torch) if(USE_DISTRIBUTED) - if(NOT MSVC) - add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) - endif() + add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) endif() @@ -966,6 +967,14 @@ if(USE_DISTRIBUTED) target_compile_definitions(torch_cpu PRIVATE USE_DISTRIBUTED ) + # Pass USE_RPC in order to reduce use of + # #if defined(USE_DISTRIBUTED) && !defined(_WIN32) + # need to be removed when RPC is 
supported + if(NOT WIN32) + target_compile_definitions(torch_cpu PRIVATE + USE_RPC + ) + endif() # Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp # can only be compiled with USE_TENSORPIPE is set. if(USE_TENSORPIPE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 028098f61d36..023bbe9e8d07 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1253,10 +1253,7 @@ if(USE_CUDA) endif() if(USE_GLOO) - if(MSVC) - message(WARNING "Gloo can not be used on Windows.") - caffe2_update_option(USE_GLOO OFF) - elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) message(WARNING "Gloo can only be used on 64-bit systems.") caffe2_update_option(USE_GLOO OFF) else() diff --git a/test/cpp/dist_autograd/CMakeLists.txt b/test/cpp/dist_autograd/CMakeLists.txt index 5d23602881f0..9969c63e16d5 100644 --- a/test/cpp/dist_autograd/CMakeLists.txt +++ b/test/cpp/dist_autograd/CMakeLists.txt @@ -1,4 +1,4 @@ -if(USE_DISTRIBUTED) +if(USE_DISTRIBUTED AND NOT WIN32) set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd") set(DIST_AUTOGRAD_TEST_SOURCES ${TORCH_ROOT}/test/cpp/common/main.cpp diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index a81bc53f175a..911a73ce432e 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -29,7 +29,7 @@ from torch.testing._internal.common_distributed import MultiProcessTestCase, \ requires_gloo, requires_nccl, requires_nccl_version, \ skip_if_not_multigpu, skip_if_lt_x_gpu, get_timeout, skip_if_rocm, \ - simple_sparse_reduce_tests + simple_sparse_reduce_tests, skip_if_win32, create_device from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, \ retry_on_connect_failures, ADDRESS_IN_USE, CONNECT_TIMEOUT, TEST_WITH_TSAN @@ -255,6 +255,7 @@ def create_tcp_store(addr): raise RuntimeError("Unable to find free port (tried %s)" % ", ".join(ports)) +@skip_if_win32() class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): store = create_tcp_store('localhost') @@ -273,6 +274,7 @@ def test_address_already_in_use(self): store2 = c10d.TCPStore(addr, port, 1, True) # noqa: F841 +@skip_if_win32() class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): super(PrefixTCPStoreTest, self).setUp() @@ -329,6 +331,7 @@ def test_unknown_handler(self): c10d.rendezvous('invalid://') +@skip_if_win32() class RendezvousEnvTest(TestCase): @retry_on_connect_failures def test_common_errors(self): @@ -455,7 +458,7 @@ def test_common_errors(self): def test_nominal(self): with tempfile.NamedTemporaryFile(delete=False) as file: - url = 'file://%s?world_size=%d' % (file.name, 2) + url = f'file:///{file.name.replace(os.path.sep, "/")}?world_size=2' gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -474,6 +477,7 @@ def test_nominal(self): self.assertEqual(b"value1", store0.get("key1")) +@skip_if_win32() class RendezvousTCPTest(TestCase): def create_tcp_url(self): @@ -544,9 +548,13 @@ def _test_store_timeout(self, backend, init_method, c2p): def _init_methods(self): f = tempfile.NamedTemporaryFile(delete=False) - yield "file://%s" % f.name - f.close() - yield "tcp://127.0.0.1:%d" % common.find_free_port() + if sys.platform == 'win32': + yield "file:///%s" % f.name.replace("\\", "/") + f.close() + else: + yield "file://%s" % f.name + f.close() + yield "tcp://127.0.0.1:%d" % common.find_free_port() def _test_default_store_timeout(self, backend): for init_method in 
self._init_methods(): @@ -584,11 +592,16 @@ def test_default_store_timeout_gloo(self): class ProcessGroupGlooTest(MultiProcessTestCase): def setUp(self): super(ProcessGroupGlooTest, self).setUp() - self._fork_processes() + + # For Windows platform, Python does not support fork, change it to spawn here. + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def opts(self, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + opts.devices = [create_device(interface=LOOPBACK)] opts.timeout = 5.0 opts.threads = threads return opts @@ -598,8 +611,8 @@ def test_multi_device_constructor(self): opts = c10d.ProcessGroupGloo.Options() opts.timeout = 5.0 opts.devices = [ - c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), - c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), + create_device(interface=LOOPBACK), + create_device(interface=LOOPBACK), ] pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts) @@ -1514,6 +1527,7 @@ def test_barrier_implies_wait(self): for i, tensor in enumerate(tensors): self.assertEqual(torch.full(size, float(i * self.world_size)), tensor) + @skip_if_win32() def test_round_robin(self): num_process_groups = 2 store = c10d.FileStore(self.file_name, self.world_size) @@ -1531,6 +1545,7 @@ def test_round_robin(self): pg.broadcast(tensor, root=0).wait() self.assertEqual(torch.full([100, 100], 0.), tensor) + @skip_if_win32() def test_round_robin_create_destroy(self): store = c10d.FileStore(self.file_name, self.world_size) @@ -1959,7 +1974,10 @@ def forward(self, x): class DistributedDataParallelTest(MultiProcessTestCase): def setUp(self): super(DistributedDataParallelTest, self).setUp() - self._fork_processes() + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def tearDown(self): # DistributedDataParallel test doesn't seem to call FileStore destructor @@ -2068,7 +2086,7 @@ def update_parameters(model): def _test_gloo_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @@ -3947,7 +3965,10 @@ def test_nccl_timeout(self): class CommTest(MultiProcessTestCase): def setUp(self): super(CommTest, self).setUp() - self._fork_processes() + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def tearDown(self): super(CommTest, self).tearDown() @@ -4013,7 +4034,7 @@ def test_broadcast_coalesced_nccl(self): def test_broadcast_coalesced_gloo_cuda(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cuda:%d" % self.rank) ranks = list(range(self.world_size)) @@ -4024,7 +4045,7 @@ def test_broadcast_coalesced_gloo_cuda(self): def test_broadcast_coalesced_gloo_cpu(self): store = c10d.FileStore(self.file_name, self.world_size) options = 
c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cpu") ranks = list(range(self.world_size)) diff --git a/test/distributed/test_c10d_spawn.py b/test/distributed/test_c10d_spawn.py index d0bf00b8a08a..c84608e8f178 100644 --- a/test/distributed/test_c10d_spawn.py +++ b/test/distributed/test_c10d_spawn.py @@ -10,8 +10,10 @@ import torch.nn as nn from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU -from torch.testing._internal.common_distributed import requires_gloo -from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, skipIfRocm +from torch.testing._internal.common_distributed import requires_gloo, \ + create_device +from torch.testing._internal.common_utils import TestCase, load_tests, \ + run_tests, skipIfRocm from torch.testing._internal.common_utils import NO_MULTIPROCESSING_SPAWN, TEST_WITH_TSAN @@ -39,7 +41,7 @@ class ProcessGroupShareTensorTest(TestCase): @classmethod def opts(cls, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [c10d.ProcessGroupGloo.create_device(interface="lo")] + opts.devices = [create_device(interface='lo')] opts.timeout = 5.0 opts.threads = threads return opts diff --git a/test/run_test.py b/test/run_test.py index d63fc372f9c2..0f9d14a78605 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -13,7 +13,7 @@ import torch import torch._six from torch.utils import cpp_extension -from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell +from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell, FILE_SCHEMA import torch.distributed as dist from typing import Dict, Optional @@ -99,7 +99,6 @@ 'distributed/rpc/test_process_group_agent', 'distributed/rpc/test_tensorpipe_agent', 'distributed/test_distributed_fork', - 'distributed/test_distributed_spawn', ] ROCM_BLOCKLIST = [ @@ -306,9 +305,13 @@ def test_distributed(test_module, test_directory, options): 'MPI not available -- MPI backend tests will be skipped') config = DISTRIBUTED_TESTS_CONFIG for backend, env_vars in config.items(): + if sys.platform == 'win32' and backend != 'gloo': + continue if backend == 'mpi' and not mpi_available: continue for with_init_file in {True, False}: + if sys.platform == 'win32' and not with_init_file: + continue tmp_dir = tempfile.mkdtemp() if options.verbose: init_str = "with {} init_method" @@ -322,9 +325,9 @@ def test_distributed(test_module, test_directory, options): os.environ.update(env_vars) if with_init_file: if test_module in ["test_distributed_fork", "test_distributed_spawn"]: - init_method = 'file://{}/'.format(tmp_dir) + init_method = f'{FILE_SCHEMA}{tmp_dir}/' else: - init_method = 'file://{}/shared_init_file'.format(tmp_dir) + init_method = f'{FILE_SCHEMA}{tmp_dir}/shared_init_file' os.environ['INIT_METHOD'] = init_method try: os.mkdir(os.path.join(tmp_dir, 'barrier')) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 174bb858da44..c21fab8ec2cf 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -537,11 +537,14 @@ libtorch_python_core_sources = [ "torch/csrc/utils/disable_torch_function.cpp", ] -libtorch_python_distributed_sources = [ - "torch/csrc/distributed/autograd/init.cpp", +libtorch_python_distributed_core_sources = [ "torch/csrc/distributed/c10d/comm.cpp", "torch/csrc/distributed/c10d/init.cpp", 
"torch/csrc/distributed/c10d/reducer.cpp", +] + +libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ + "torch/csrc/distributed/autograd/init.cpp", "torch/csrc/distributed/rpc/init.cpp", "torch/csrc/distributed/rpc/process_group_agent.cpp", "torch/csrc/distributed/rpc/py_rref.cpp", diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index b78dc4a362a7..2ae2f7f737fe 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -160,25 +160,28 @@ endif() if(USE_DISTRIBUTED) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED) - if(NOT MSVC) + if(WIN32) + append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) + else() + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RPC) append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS) - # Disable certain warnings for GCC-9.X - if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - endif() - list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) - if(USE_TENSORPIPE) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) - list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) - endif() endif() + # Disable certain warnings for GCC-9.X + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + endif() + if(USE_TENSORPIPE) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) + list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) + endif() + list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) endif() -if(USE_NCCL) +if(USE_NCCL AND NOT WIN32) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index ed4aa21a8f76..ae6f15155f2a 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -688,9 +688,9 @@ PyObject* initModule() { #ifdef USE_CUDA THPUtils_addPyMethodDefs(methods, THCPModule_methods()); #endif -#ifdef USE_DISTRIBUTED -#ifdef USE_C10D +#if defined(USE_DISTRIBUTED) && defined(USE_C10D) THPUtils_addPyMethodDefs(methods, torch::distributed::c10d::python_functions()); +#ifndef _WIN32 THPUtils_addPyMethodDefs(methods, torch::distributed::rpc::python_functions()); THPUtils_addPyMethodDefs( methods, torch::distributed::autograd::python_functions()); diff --git a/torch/csrc/WindowsTorchApiMacro.h 
b/torch/csrc/WindowsTorchApiMacro.h index 7f8ef4e01677..7f44db0baba9 100644 --- a/torch/csrc/WindowsTorchApiMacro.h +++ b/torch/csrc/WindowsTorchApiMacro.h @@ -5,3 +5,9 @@ // There's no difference between aten, torch and caffe2 libs any more // TODO: clean up the naming for consistency #define TORCH_API CAFFE2_API + +#ifdef _WIN32 +#define TORCH_PYTHON_API +#else +#define TORCH_PYTHON_API CAFFE2_API +#endif diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index e2b501f08aff..2eb626c40232 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -38,7 +38,7 @@ class GradBucket { // DDP's c10d reducer allows communication hooks defined as a sub class // of CommHookInterface. CommHookInterface is an abstract class and can // be used to implement both Python and CPP hooks. -struct TORCH_API CommHookInterface { +struct TORCH_PYTHON_API CommHookInterface { public: virtual ~CommHookInterface() {} @@ -59,7 +59,7 @@ struct TORCH_API CommHookInterface { // PythonCommHook enables registering a python hook to c10d reducer and is a // sub class of CommHookInterface. -class TORCH_API PythonCommHook : public CommHookInterface { +class TORCH_PYTHON_API PythonCommHook : public CommHookInterface { public: // The constructor takes a state and a callable hook. Inputs are Python // objects. The state is passed to the hook in runHook function can be used to diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 165d6a1c8603..be1752d7366f 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1,7 +1,11 @@ #include #include +#ifndef _WIN32 #include +#include +#include +#endif #include #ifdef USE_C10D_GLOO @@ -17,8 +21,6 @@ #endif #include -#include -#include #include #include @@ -323,6 +325,7 @@ They are used in specifying strategies for reduction collectives, e.g., shared_ptr_class_<::c10d::FileStore>(module, "FileStore", store) .def(py::init()); +#ifndef _WIN32 shared_ptr_class_<::c10d::HashStore>(module, "HashStore", store) .def(py::init<>()); @@ -340,6 +343,7 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("is_master"), py::arg("timeout") = std::chrono::milliseconds(::c10d::Store::kDefaultTimeout)); +#endif shared_ptr_class_<::c10d::PrefixStore>(module, "PrefixStore", store) .def(py::init>()); @@ -607,6 +611,7 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("opts") = ::c10d::BarrierOptions(), py::call_guard()); +#ifndef _WIN32 module.def( "_round_robin_process_groups", [](std::vector> processGroups) @@ -620,6 +625,7 @@ They are used in specifying strategies for reduction collectives, e.g., }, py::arg("process_groups"), py::call_guard()); +#endif #ifdef USE_C10D_GLOO auto processGroupGloo = shared_ptr_class_<::c10d::ProcessGroupGloo>( diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 86916c7994dd..814d3494ff4e 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -89,10 +89,7 @@ Reducer::Reducer( for (size_t variable_index = 0; variable_index < variable_count; variable_index++) { auto& variable = replicas_[replica_index][variable_index]; - const auto index = VariableIndex{ - .replica_index = replica_index, - .variable_index = variable_index, - }; + const auto index = VariableIndex(replica_index, variable_index); // The gradient accumulator function is lazily initialized once. 
// Therefore we can use its presence in the autograd graph as @@ -100,15 +97,19 @@ Reducer::Reducer( auto grad_accumulator = torch::autograd::impl::grad_accumulator(variable); +#ifndef _WIN32 using torch::distributed::autograd::ThreadLocalDistAutogradContext; +#endif // Hook to execute after the gradient accumulator has executed. hooks_.emplace_back( grad_accumulator->add_post_hook( torch::make_unique( [=](const torch::autograd::variable_list& outputs, const torch::autograd::variable_list& /* unused */) { +#ifndef _WIN32 this->rpc_context_.set( ThreadLocalDistAutogradContext::getContextPtr()); +#endif this->autograd_hook(index); return outputs; })), @@ -477,10 +478,7 @@ void Reducer::push_rebuilt_params_for_all_indices() { const auto variable_count = replicas_[replica_index].size(); for (size_t variable_index = 0; variable_index < variable_count; ++variable_index) { - const auto index = VariableIndex{ - .replica_index = replica_index, - .variable_index = variable_index, - }; + const auto index = VariableIndex(replica_index, variable_index); push_rebuilt_params(index); } } @@ -850,10 +848,8 @@ void Reducer::initialize_buckets( TORCH_CHECK( variable_index < variable_locators_.size(), "Out of range variable index specified."); - variable_locators_[variable_index] = VariableLocator{ - .bucket_index = bucket_index, - .intra_bucket_index = intra_bucket_index++, - }; + variable_locators_[variable_index] = VariableLocator( + bucket_index, intra_bucket_index++); } bucket.variable_indices = std::move(bucket_indices[bucket_index]); @@ -1235,7 +1231,9 @@ void Reducer::runGradCallbackForVariable( cb(variable.mutable_grad()); } else { // Under distributed autograd +#ifndef _WIN32 context_ptr->runGradCallbackForVariable(variable, std::move(cb)); +#endif } } diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 960a32356acf..486b7337366a 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -104,6 +104,13 @@ class Reducer { struct VariableIndex { size_t replica_index; size_t variable_index; + + VariableIndex() = default; + + VariableIndex(size_t replica_index_, size_t variable_index_) { + replica_index = replica_index_; + variable_index = variable_index_; + } }; void push_rebuilt_params(const VariableIndex& index); @@ -281,6 +288,13 @@ class Reducer { size_t bucket_index; // Index of parameter in single bucket replica. size_t intra_bucket_index; + + VariableLocator() = default; + + VariableLocator(size_t bucket_index_, size_t intra_bucket_index_) { + bucket_index = bucket_index_; + intra_bucket_index = intra_bucket_index_; + } }; // Map the index of a variable to its location in the bucket structure. 
diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index 65f5a49145c8..4be55a9caa90 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -320,7 +320,7 @@ inline InferredType tryToInferType(py::handle input) { if (py::isinstance(input)) { auto object = py::cast(input); return InferredType(object.type()); -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC } else if (py::isinstance(input)) { auto rref_ivalue = input.cast().toIValue(); return InferredType(rref_ivalue.type()); @@ -716,7 +716,7 @@ inline IValue toIValue( } } case TypeKind::RRefType: { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC return obj.cast().toIValue(); #else AT_ERROR("RRef is only supported with the distributed package"); @@ -896,7 +896,7 @@ inline py::object toPyObject(IValue ivalue) { } return std::move(py_dict); } else if (ivalue.isRRef()) { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC auto RRefPtr = c10::dynamic_intrusive_pointer_cast( std::move(ivalue).toRRef()); @@ -942,7 +942,7 @@ inline py::object toPyObject(IValue ivalue) { auto py_class = getScriptedClassOrError(qualified_class_name); return py_class.attr(enum_holder->name().c_str()); } else if (ivalue.isRRef()) { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC return py::cast(torch::distributed::rpc::PyRRef( c10::static_intrusive_pointer_cast( ivalue.toRRef()))); diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index ba94d33f37b3..119b6b5e5de7 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -916,7 +916,7 @@ std::shared_ptr toSugaredValue( } else if ( obj.ptr() == py::module::import("torch.jit").attr("annotate").ptr()) { return SpecialFormValue::create(prim::annotate); -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC // RPC module is only avaialble when build flag "USE_DISTRIBUTED" is on. 
} else if ( obj.ptr() == diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 337fe66c0789..f61e2597447f 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -23,7 +23,7 @@ #include #include -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC #include using torch::distributed::autograd::DistAutogradContainer; #endif @@ -267,7 +267,7 @@ void insertLastUses(Graph& g) { } inline int64_t getDistAutogradContextId() { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC return DistAutogradContainer::currentContextId(); #else return 0; @@ -1690,7 +1690,7 @@ InterpreterState::InterpreterState( : pImpl(std::move(pImpl_)) {} void InterpreterContinuation::operator()() { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC auto prev_dist_id = DistAutogradContainer::currentContextId(); DistAutogradContainer::forceCurrentContextId(dist_autograd_context_id_); #endif @@ -1700,7 +1700,7 @@ void InterpreterContinuation::operator()() { } else { state.runAsync(stack); } -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC DistAutogradContainer::forceCurrentContextId(prev_dist_id); #endif } diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 6f911f4246cc..2bc9abea8c57 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -1,6 +1,6 @@ #include #include -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC #include #endif #include @@ -130,7 +130,7 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { "this class."; AT_ERROR(err.str()); } else if (ivalue.isRRef()) { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC TORCH_CHECK( torch::distributed::rpc::getAllowJitRRefPickle() == true, "RRef jit pickling is only allowed inside RPC calls."); @@ -166,7 +166,7 @@ void Pickler::pushDevice(const IValue& ivalue) { } } -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC void Pickler::pushRRef(const IValue& ivalue) { // It is the same as how rref is pickled in python, see PyRRef::pickle auto rrefInterface = ivalue.toRRef(); diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index c416f9641023..9b8fce0b4869 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -1,6 +1,6 @@ #include #include -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC #include #endif #include @@ -549,7 +549,7 @@ void Unpickler::readGlobal( stack_.emplace_back(int64_t(globals_.size() - 1)); return; } else if (module_name == "torch.distributed.rpc" && class_name == "rref") { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC return rebuildRRef(); #else TORCH_INTERNAL_ASSERT( @@ -669,7 +669,7 @@ void Unpickler::rebuildTensor(bool quantized) { }); } -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC void Unpickler::rebuildRRef() { globals_.emplace_back([this] { // It is the same as how rref is unpickled in python, diff --git a/torch/csrc/utils/future.h b/torch/csrc/utils/future.h index 6d672ee86cd5..093d043ecf7d 100644 --- a/torch/csrc/utils/future.h +++ b/torch/csrc/utils/future.h @@ -26,7 +26,7 @@ class TORCH_API FutureError final : public std::exception { // Most implementation is copied from FutureMessage and // c10::ivalue::Future template -class TORCH_API Future final { +class TORCH_PYTHON_API Future final { public: Future() = default; diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py index 292634580aab..4545aea2bf56 100644 --- a/torch/distributed/rendezvous.py +++ b/torch/distributed/rendezvous.py @@ -6,9 +6,12 @@ import torch._six as six 
import numbers import os -from . import FileStore, TCPStore +import sys +from . import FileStore from .constants import default_pg_timeout +if sys.platform != 'win32': + from . import TCPStore _rendezvous_handlers = {} @@ -90,6 +93,10 @@ def _error(msg): result = urlparse(url) path = result.path + if sys.platform == 'win32': + import urllib.request + path = urllib.request.url2pathname(result.path) + if not path: raise _error("path missing") query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) @@ -175,7 +182,8 @@ def _env_error(var): # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using env:// method") +if sys.platform != 'win32': + register_rendezvous_handler("tcp", _tcp_rendezvous_handler) + register_rendezvous_handler("env", _env_rendezvous_handler) register_rendezvous_handler("file", _file_rendezvous_handler) -register_rendezvous_handler("tcp", _tcp_rendezvous_handler) -register_rendezvous_handler("env", _env_rendezvous_handler) diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index 68fe49f411f5..4b206f380111 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -45,15 +45,16 @@ endfunction() set(C10D_SRCS FileStore.cpp - HashStore.cpp ProcessGroup.cpp - ProcessGroupRoundRobin.cpp Store.cpp PrefixStore.cpp - TCPStore.cpp Utils.cpp ) +if(NOT WIN32) + list(APPEND C10D_SRCS HashStore.cpp ProcessGroupRoundRobin.cpp TCPStore.cpp) +endif() + set(C10D_LIBS torch) if(USE_C10D_NCCL) @@ -77,14 +78,17 @@ endif() add_library(c10d STATIC ${C10D_SRCS}) set_property(TARGET c10d PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET c10d PROPERTY CXX_STANDARD 14) -target_compile_options(c10d PUBLIC - -Wall - -Wextra - -Wno-unused-parameter - -Wno-missing-field-initializers - -Wno-write-strings - -Wno-unknown-pragmas - ) + +if(NOT MSVC) + target_compile_options(c10d PUBLIC + -Wall + -Wextra + -Wno-unused-parameter + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + ) +endif() add_dependencies(c10d torch) @@ -118,17 +122,19 @@ if(USE_C10D_GLOO) endif() copy_header(FileStore.hpp) -copy_header(HashStore.hpp) copy_header(PrefixStore.hpp) copy_header(ProcessGroup.hpp) copy_header(Store.hpp) -copy_header(TCPStore.hpp) copy_header(Types.hpp) copy_header(Utils.hpp) if(USE_GLOO) copy_header(ProcessGroupGloo.hpp) copy_header(GlooDeviceFactory.hpp) endif() +if(NOT WIN32) + copy_header(HashStore.hpp) + copy_header(TCPStore.hpp) +endif() if(USE_C10D_NCCL) copy_header(ProcessGroupNCCL.hpp) diff --git a/torch/lib/c10d/FileStore.cpp b/torch/lib/c10d/FileStore.cpp index 55346e0fa635..eb25c52f787a 100644 --- a/torch/lib/c10d/FileStore.cpp +++ b/torch/lib/c10d/FileStore.cpp @@ -3,9 +3,16 @@ #include #include #include -#include #include + +#ifdef _WIN32 +#include +#include +#include +#else +#include #include +#endif #include #include @@ -21,6 +28,40 @@ throw std::system_error(errno, std::system_category(), ##__VA_ARGS__); \ } +#ifdef _WIN32 +#define LOCK_EX 0x00000001 +#define LOCK_SH 0x00000010 +#define LOCK_UN 0x00000100 + +int flock_(int fd, int op) { + HANDLE hdl = (HANDLE) _get_osfhandle(fd); + DWORD low = 1, high = 0; + OVERLAPPED offset = {0, 0, 0, 0, NULL}; + + if (hdl < 0) + return -1; + + switch (op) { + case LOCK_EX: + if (LockFileEx(hdl, LOCKFILE_EXCLUSIVE_LOCK, 0, low, high, &offset)) + return 0; + break; + case LOCK_SH: + if (LockFileEx(hdl, 0, 0, low, high, &offset)) + return 0; + break; + case LOCK_UN: + 
if(UnlockFileEx(hdl, 0, low, high, &offset) != 0) + return 0; + break; + default: + break; + } + errno = EINVAL; + return -1; +} +#endif + namespace c10d { namespace { @@ -79,7 +120,11 @@ class Lock { int fd_{-1}; void flock(int operation) { +#ifdef _WIN32 + auto rv = syscall(std::bind(::flock_, fd_, operation)); +#else auto rv = syscall(std::bind(::flock, fd_, operation)); +#endif SYSASSERT(rv, "flock"); } }; @@ -92,7 +137,11 @@ class File { std::chrono::milliseconds timeout) { const auto start = std::chrono::steady_clock::now(); while (true) { +#ifdef _WIN32 + fd_ = syscall(std::bind(::open, path.c_str(), flags | _O_BINARY, _S_IREAD | _S_IWRITE)); +#else fd_ = syscall(std::bind(::open, path.c_str(), flags, 0644)); +#endif // Only retry when the file doesn't exist, since we are waiting for the // file to be created in this case to address the following issue: // https://github.com/pytorch/pytorch/issues/13750 diff --git a/torch/lib/c10d/GlooDeviceFactory.cpp b/torch/lib/c10d/GlooDeviceFactory.cpp index 70c3c2bb7a31..dca6b03eb9dd 100644 --- a/torch/lib/c10d/GlooDeviceFactory.cpp +++ b/torch/lib/c10d/GlooDeviceFactory.cpp @@ -36,16 +36,16 @@ C10_DEFINE_SHARED_REGISTRY_WITHOUT_WARNING( #if GLOO_HAVE_TRANSPORT_TCP static std::shared_ptr<::gloo::transport::Device> makeTCPDevice( - const std::string& interface, + const std::string& interfaceName, const std::string& hostname) { TORCH_CHECK( - !interface.empty() || !hostname.empty(), + !interfaceName.empty() || !hostname.empty(), "GlooDeviceFactory::makeTCPDevice(): interface or hostname " "can't be empty"); ::gloo::transport::tcp::attr attr; - if (!interface.empty()) { - attr.iface = interface; + if (!interfaceName.empty()) { + attr.iface = interfaceName; } else { attr.hostname = hostname; } @@ -61,16 +61,16 @@ C10_REGISTER_CREATOR(GlooDeviceRegistry, TCP, makeTCPDevice); #if GLOO_HAVE_TRANSPORT_UV static std::shared_ptr<::gloo::transport::Device> makeUVDevice( - const std::string& interface, + const std::string& interfaceName, const std::string& hostname) { TORCH_CHECK( - !interface.empty() || !hostname.empty(), + !interfaceName.empty() || !hostname.empty(), "GlooDeviceFactory::makeUVDevice(): interface or hostname " "can't be empty"); ::gloo::transport::uv::attr attr; - if (!interface.empty()) { - attr.iface = interface; + if (!interfaceName.empty()) { + attr.iface = interfaceName; } else { attr.hostname = hostname; } @@ -81,23 +81,28 @@ static std::shared_ptr<::gloo::transport::Device> makeUVDevice( // the flexibility of other application to override by priority. Register // UV to `UV` for env "GLOO_DEVICE_TRANSPORT" override. 
C10_REGISTER_CREATOR(GlooDeviceRegistry, APPLE, makeUVDevice); +C10_REGISTER_CREATOR(GlooDeviceRegistry, WIN32, makeUVDevice); C10_REGISTER_CREATOR(GlooDeviceRegistry, UV, makeUVDevice); #endif static const char* glooDeviceTransport = getenv("GLOO_DEVICE_TRANSPORT"); std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory:: - makeDeviceForInterface(const std::string& interface) { + makeDeviceForInterface(const std::string& interfaceName) { if (glooDeviceTransport) { - return GlooDeviceRegistry()->Create(glooDeviceTransport, interface, ""); + return GlooDeviceRegistry()->Create(glooDeviceTransport, interfaceName, ""); } #ifdef __linux__ - return GlooDeviceRegistry()->Create("LINUX", interface, ""); + return GlooDeviceRegistry()->Create("LINUX", interfaceName, ""); #endif #ifdef __APPLE__ - return GlooDeviceRegistry()->Create("APPLE", interface, ""); + return GlooDeviceRegistry()->Create("APPLE", interfaceName, ""); +#endif + +#ifdef _WIN32 + return GlooDeviceRegistry()->Create("WIN32", interfaceName, ""); #endif throw std::runtime_error("makeDeviceForInterface(): unsupported gloo device"); @@ -117,6 +122,10 @@ std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory:: return GlooDeviceRegistry()->Create("APPLE", "", hostname); #endif +#ifdef _WIN32 + return GlooDeviceRegistry()->Create("WIN32", "", hostname); +#endif + throw std::runtime_error("makeDeviceForHostname(): unsupported gloo device"); } diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index 531fe751f1c9..c139ac7a34fd 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -2,10 +2,16 @@ #include +#ifdef _WIN32 +#include +#include +#include +#else #include #include -#include #include +#endif +#include #include @@ -36,6 +42,36 @@ #include #include +#ifdef _WIN32 +#define GENERATE_ALL_TYPES(type, func, ...) \ + switch (type) { \ + case ::at::ScalarType::Float: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Double: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Half: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Char: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Byte: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Int: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Long: \ + func(__VA_ARGS__); \ + break; \ + default: \ + throw std::runtime_error("Invalid scalar type"); \ + } + +#define HOST_NAME_MAX 256 +#else #define GENERATE_ALL_TYPES(type, func, args...) \ switch (type) { \ case ::at::ScalarType::Float: \ @@ -62,6 +98,7 @@ default: \ throw std::runtime_error("Invalid scalar type"); \ } +#endif namespace c10d { @@ -409,12 +446,19 @@ ProcessGroupGloo::Options::Options() namespace { +void socketInitialize() { +#ifdef _WIN32 + ::gloo::init_winsock(); +#endif +} + // Gloo assumes that this machine's hostname can always be resolved // to an address. If it doesn't it throws a runtime error saying // that it can't be resolved. Instead of catching it, we choose // to proactively check if an address can be resolved, so we can // gracefully fall back to an alternative if it doesn't. 
bool doesHostnameResolveToUsableAddress(const std::string& hostname) { + socketInitialize(); struct addrinfo hints; memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; @@ -431,7 +475,11 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) { continue; } rv = bind(fd, rp->ai_addr, rp->ai_addrlen); +#ifdef _WIN32 + closesocket(fd); +#else close(fd); +#endif if (rv == -1) { continue; } @@ -443,14 +491,11 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) { } // namespace -#if defined(__linux__) || defined(__APPLE__) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: - createDeviceForInterface(const std::string& interface) { - return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface); + createDeviceForInterface(const std::string& interface_name) { + return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface_name); } -#endif -#if defined(__linux__) || defined(__APPLE__) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: createDeviceForHostname(const std::string& hostname) { TORCH_CHECK( @@ -460,14 +505,14 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: " to a (local) address"); return ::c10d::GlooDeviceFactory::makeDeviceForHostname(hostname); } -#endif -#ifdef __linux__ +#if defined(__linux__) || defined(_WIN32) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: createDefaultDevice() { // Use the hostname to resolve the network address to // use. Note: if the hostname does not resolve to an address (e.g. // because of misconfigured /etc/hosts file), this will not work. + socketInitialize(); std::array hostname{}; auto rv = gethostname(hostname.data(), HOST_NAME_MAX); if (rv != 0) { diff --git a/torch/lib/c10d/Utils.cpp b/torch/lib/c10d/Utils.cpp index d975f6eb6bc5..6c6e941ef95d 100644 --- a/torch/lib/c10d/Utils.cpp +++ b/torch/lib/c10d/Utils.cpp @@ -1,5 +1,6 @@ #include +#ifndef _WIN32 #include #include @@ -354,6 +355,6 @@ std::tuple accept( return std::make_tuple( socket, sockaddrToString(reinterpret_cast(&addr))); } - } // namespace tcputil } // namespace c10d +#endif diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 1bdaddde9f24..1116cd39ba1c 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -1,6 +1,8 @@ #pragma once +#ifndef _WIN32 #include +#endif #include #include @@ -480,6 +482,7 @@ class ResourceGuard { bool released_; }; +#ifndef _WIN32 namespace tcputil { constexpr std::chrono::milliseconds kNoTimeout = std::chrono::milliseconds(-1); @@ -609,4 +612,5 @@ std::tuple accept( const std::chrono::milliseconds& timeout = kNoTimeout); } // namespace tcputil +#endif } // namespace c10d diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index 8429d1099b29..003f56f30861 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -8,14 +8,19 @@ function(c10d_add_test test_src) get_filename_component(test_name ${test_src} NAME_WE) add_executable(${test_name} "${test_src}") target_include_directories(${test_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..) 
- target_link_libraries(${test_name} pthread ${ARGN}) - target_compile_options(${test_name} PRIVATE -Wno-error) + target_link_libraries(${test_name} ${ARGN}) + if(NOT WIN32) + target_link_libraries(${test_name} pthread) + target_compile_options(${test_name} PRIVATE -Wno-error) + endif() add_test(NAME ${test_name} COMMAND $) endfunction() c10d_add_test(FileStoreTest.cpp c10d gtest_main) -c10d_add_test(HashStoreTest.cpp c10d gtest_main) -c10d_add_test(TCPStoreTest.cpp c10d gtest_main) +if(NOT WIN32) + c10d_add_test(HashStoreTest.cpp c10d gtest_main) + c10d_add_test(TCPStoreTest.cpp c10d gtest_main) +endif() if(USE_CUDA) if(USE_C10D_GLOO) @@ -29,7 +34,7 @@ if(USE_CUDA) endif() else() if(USE_C10D_GLOO) - c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d gtest_main) + c10d_add_test(ProcessGroupGlooTest.cpp c10d gtest_main) endif() endif() diff --git a/torch/lib/c10d/test/CUDATest.hpp b/torch/lib/c10d/test/CUDATest.hpp index defaff895a18..328da2faf648 100644 --- a/torch/lib/c10d/test/CUDATest.hpp +++ b/torch/lib/c10d/test/CUDATest.hpp @@ -5,9 +5,15 @@ namespace c10d { namespace test { -void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks); +#ifdef _WIN32 +#define EXPORT_TEST_API __declspec(dllexport) +#else +#define EXPORT_TEST_API +#endif -int cudaNumDevices(); +EXPORT_TEST_API void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks); + +EXPORT_TEST_API int cudaNumDevices(); } // namespace test } // namespace c10d diff --git a/torch/lib/c10d/test/FileStoreTest.cpp b/torch/lib/c10d/test/FileStoreTest.cpp index 77215f4521c2..cc8da6326091 100644 --- a/torch/lib/c10d/test/FileStoreTest.cpp +++ b/torch/lib/c10d/test/FileStoreTest.cpp @@ -1,6 +1,8 @@ #include +#ifndef _WIN32 #include +#endif #include #include @@ -10,6 +12,11 @@ #include #include +#ifdef _WIN32 +std::string tmppath() { + return c10d::test::autoGenerateTmpFilePath(); +} +#else std::string tmppath() { const char* tmpdir = getenv("TMPDIR"); if (tmpdir == nullptr) { @@ -29,6 +36,7 @@ std::string tmppath() { close(fd); return std::string(tmp.data(), tmp.size()); } +#endif void testGetSet(std::string path, std::string prefix = "") { // Basic Set/Get on File Store diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index 6606e553e733..da4f9b5fc106 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -1,7 +1,10 @@ +#ifndef _WIN32 #include -#include #include #include +#endif + +#include #include #include @@ -21,6 +24,7 @@ using namespace c10d::test; constexpr auto kSendDelay = std::chrono::milliseconds(100); constexpr auto kWaitTimeout = std::chrono::milliseconds(1); +#ifndef _WIN32 class SignalTest { public: SignalTest(const std::string& path) : path_(path) {} @@ -92,6 +96,7 @@ std::shared_ptr<::c10d::ProcessGroup::Work> testSignal( test.arm(fork.pid, signal); return test.run(0, 2); } +#endif class ProcessGroupGlooDelayed : public ::c10d::ProcessGroupGloo { public: @@ -456,6 +461,7 @@ void testRecv(const std::string& path) { EXPECT_TRUE(recvCompleted); } +#ifndef _WIN32 TEST(ProcessGroupGlooTest, testSIGSTOPException) { // test SIGSTOP // Fork() and TSAN don't play well together, so skip the test if we're testing @@ -485,6 +491,7 @@ TEST(ProcessGroupGlooTest, testSIGKILLException) { EXPECT_FALSE(work->isSuccess()); EXPECT_THROW(std::rethrow_exception(work->exception()), std::exception); } +#endif TEST(ProcessGroupGlooTest, testAllReduceCPU) { { diff --git a/torch/lib/c10d/test/TestUtils.hpp 
b/torch/lib/c10d/test/TestUtils.hpp index c62695485573..5f5dfca315cb 100644 --- a/torch/lib/c10d/test/TestUtils.hpp +++ b/torch/lib/c10d/test/TestUtils.hpp @@ -1,9 +1,12 @@ #pragma once +#ifndef _WIN32 #include -#include #include #include +#endif + +#include #include #include @@ -37,6 +40,28 @@ class Semaphore { std::condition_variable cv_; }; +#ifdef _WIN32 +std::string autoGenerateTmpFilePath() { + char tmp[L_tmpnam_s]; + errno_t err; + err = tmpnam_s(tmp, L_tmpnam_s); + if (err != 0) + { + throw std::system_error(errno, std::system_category()); + } + return std::string(tmp); +} + +std::string tmppath() { + const char* tmpfile = getenv("TMPFILE"); + if (tmpfile) { + return std::string(tmpfile); + } + else { + return autoGenerateTmpFilePath(); + } +} +#else std::string tmppath() { // TMPFILE is for manual test execution during which the user will specify // the full temp file path using the environmental variable TMPFILE @@ -63,6 +88,7 @@ std::string tmppath() { close(fd); return std::string(tmp.data(), tmp.size()); } +#endif bool isTSANEnabled() { auto s = std::getenv("PYTORCH_TEST_WITH_TSAN"); @@ -80,6 +106,7 @@ struct TemporaryFile { } }; +#ifndef _WIN32 struct Fork { pid_t pid; @@ -101,6 +128,7 @@ struct Fork { return pid == 0; } }; +#endif } // namespace test } // namespace c10d diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index f8e5b4822bd8..b2cd30c66812 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -16,7 +16,7 @@ import torch.distributed as c10d from functools import partial, reduce -from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM +from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, FILE_SCHEMA class TestSkip(NamedTuple): exit_code: int @@ -143,10 +143,23 @@ def wrapper(*args, **kwargs): return wrapper +def skip_if_win32(): + return unittest.skipIf( + sys.platform == 'win32', + "This unit test case is not supportted on Windows platform", + ) + TIMEOUT_DEFAULT = 100 TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400} +def create_device(interface=None): + if sys.platform == 'win32' or interface is None: + return c10d.ProcessGroupGloo.create_device(hostname="127.0.0.1") + else: + return c10d.ProcessGroupGloo.create_device(interface=interface) + + def get_timeout(test_id): return TIMEOUT_OVERRIDE.get(test_id.split('.')[-1], TIMEOUT_DEFAULT) @@ -206,7 +219,7 @@ def initialize_temp_directories(init_method=None): if init_method is not None: os.environ["INIT_METHOD"] = init_method else: - os.environ["INIT_METHOD"] = "file://" + os.path.join( + os.environ["INIT_METHOD"] = FILE_SCHEMA + os.path.join( init_dir_path, "shared_init_file" ) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 9959551031ff..36434ff8aa2f 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -53,6 +53,10 @@ torch.backends.disable_global_flags() +FILE_SCHEMA = "file://" +if sys.platform == 'win32': + FILE_SCHEMA = "file:///" + IS_SANDCASTLE = os.getenv('SANDCASTLE') == '1' or os.getenv('TW_JOB_USER') == 'sandcastle' class ProfilingMode(Enum): diff --git a/torch/testing/_internal/dist_utils.py b/torch/testing/_internal/dist_utils.py index b88765211df1..93de304a53ca 100644 --- a/torch/testing/_internal/dist_utils.py +++ b/torch/testing/_internal/dist_utils.py @@ -7,6 +7,7 @@ import torch.distributed as dist import torch.distributed.rpc as rpc from 
torch.distributed.rpc import _rref_context_get_debug_info # type: ignore[attr-defined] +from torch.testing._internal.common_utils import FILE_SCHEMA if not dist.is_available(): @@ -14,7 +15,7 @@ sys.exit(0) -INIT_METHOD_TEMPLATE = "file://{file_name}" +INIT_METHOD_TEMPLATE = FILE_SCHEMA + "{file_name}" def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True, diff --git a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py index 1b1f755ed4cc..09db831e9999 100644 --- a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py +++ b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py @@ -20,7 +20,7 @@ skip_if_lt_x_gpu, skip_if_rocm, ) -from torch.testing._internal.dist_utils import dist_init +from torch.testing._internal.dist_utils import dist_init, INIT_METHOD_TEMPLATE from torch.testing._internal.distributed.rpc.rpc_agent_test_fixture import ( RpcAgentTestFixture, ) @@ -329,7 +329,7 @@ def _remote_worker_process(self): gLogger.info("The remote worker is running.") dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -346,7 +346,7 @@ def _trainer_process(self, rank: int): ) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -363,7 +363,7 @@ def _master_process(self, ddp_mode: DdpMode, simulate_uneven_inputs: bool): gLogger.info("Running the master process...") dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -500,7 +500,7 @@ def _run_test_ddp_comparision(self, simulate_uneven_inputs=False): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -567,7 +567,7 @@ def test_ddp_dist_autograd_sparse_grads(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -604,7 +604,7 @@ def test_ddp_dist_autograd_local_vs_remote(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -651,7 +651,7 @@ def test_ddp_dist_autograd_local_vs_remote_gpu(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index f6f2b9a6fbfb..af5e648f6acb 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -1,5 +1,4 @@ import copy -import fcntl import itertools import random import math @@ -22,6 
+21,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.distributed.distributed_c10d import _get_default_group, AllreduceOptions, GroupMember +from torch.testing._internal.common_utils import FILE_SCHEMA from torch.testing._internal.common_distributed import ( MultiProcessTestCase, TEST_SKIPS, @@ -43,6 +43,10 @@ except ImportError: HAS_TORCHVISION = False +if sys.platform == 'win32': + import msvcrt +else: + import fcntl class Foo: def __init__(self, x): @@ -191,10 +195,17 @@ def _lock(): lockfile = os.path.join(TEMP_DIR, "lockfile") with open(lockfile, "w") as lf: try: - fcntl.flock(lf.fileno(), fcntl.LOCK_EX) - yield + if sys.platform == 'win32': + msvcrt.locking(lf.fileno(), msvcrt.LK_RLCK, 1) + yield + else: + fcntl.flock(lf.fileno(), fcntl.LOCK_EX) + yield finally: - fcntl.flock(lf.fileno(), fcntl.LOCK_UN) + if sys.platform == 'win32': + msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1) + else: + fcntl.flock(lf.fileno(), fcntl.LOCK_UN) lf.close() @@ -270,7 +281,7 @@ def tearDown(self): @property def init_method(self): - return "file://{file_name}".format(file_name=self.file_name) + return "{}{file_name}".format(FILE_SCHEMA, file_name=self.file_name) @classmethod def _run(cls, rank, test_name, file_name): @@ -2162,8 +2173,13 @@ def _test_DDP_5iter( # save the model in the middle and reload if test_save and idx == 2 and INIT_METHOD.startswith("file://"): with tempfile.NamedTemporaryFile() as tmp: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + if sys.platform == 'win32': + torch.save(model_DDP, tmp) + tmp.seek(0) + model_DDP = torch.load(tmp) + else: + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) with tempfile.TemporaryFile() as tmp_file: torch.save(model_DDP, tmp_file) @@ -2192,8 +2208,13 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None, gr # test serializable/unserializable with tempfile.NamedTemporaryFile() as tmp: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + if sys.platform == 'win32': + torch.save(model_DDP, tmp) + tmp.seek(0) + model_DDP = torch.load(tmp) + else: + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) # dummy data initialization local_bs = len(gpu_subset) @@ -2350,8 +2371,13 @@ def _test_DistributedDataParallel_SyncBatchNorm(self, gpu_subset, rank, local_bs # test serializable/unserializable with tempfile.NamedTemporaryFile() as tmp: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + if sys.platform == 'win32': + torch.save(model_DDP, tmp) + tmp.seek(0) + model_DDP = torch.load(tmp) + else: + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) # data initialization input_cpu = torch.randn(global_bs, 2) From 31ae8117baec653f6d7688d33dbabc31be5378e1 Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Thu, 24 Sep 2020 22:00:37 -0700 Subject: [PATCH 007/292] [RFC] Remove per-op-registration related code in caffe2/tools/codegen/gen.py (#45134) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45134 Per-Op-Registration was a mechanism used for mobile selective build v0. Since then, a new dispathing mechanism has been built for PyTorch, and this code path isn't used any more. Remove it to simplify understanding/updating the code-generator's code-flow. ghstack-source-id: 112723942 Test Plan: `buck build` and sandcastle. 
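For readers skimming the gen.py hunk below: the --per_op_registration path being deleted grouped native functions by their un-overloaded operator name and wrote one registration file per whitelisted op. A heavily simplified sketch of that flow, using stand-in types rather than the real gen.py data model:

```
from collections import defaultdict

def per_op_registration_filename(opname):
    # e.g. "aten::add" -> "pt_op_register_aten--add.cpp"
    return 'pt_op_register_{}.cpp'.format(opname.replace(':', '-'))

def group_by_base_name(native_functions):
    # All overloads of an operator share one key and land in one generated file.
    grouped = defaultdict(list)
    for f in native_functions:
        grouped['aten::{}'.format(f.func.name.name)].append(f)
    return grouped
```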
Reviewed By: ezyang Differential Revision: D23806632 fbshipit-source-id: d93cd324650c541d9bfc8eeff2ddb2833b988ecc --- aten/src/ATen/templates/PerOpRegistration.cpp | 15 ------ tools/codegen/gen.py | 53 ++----------------- 2 files changed, 4 insertions(+), 64 deletions(-) delete mode 100644 aten/src/ATen/templates/PerOpRegistration.cpp diff --git a/aten/src/ATen/templates/PerOpRegistration.cpp b/aten/src/ATen/templates/PerOpRegistration.cpp deleted file mode 100644 index 72ac3d784dad..000000000000 --- a/aten/src/ATen/templates/PerOpRegistration.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// ${generated_comment} - -#include -#include -#include -#include -$extra_headers - -namespace at { - -TORCH_LIBRARY_FRAGMENT_THIS_API_IS_FOR_PER_OP_REGISTRATION_ONLY(aten, m) { - ${function_registrations} -} - -} // namespace at diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index be8c57f1061a..83d9fa04cf37 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -2,7 +2,7 @@ import contextlib import textwrap import itertools -from typing import List, Dict, Optional, Iterator, Tuple, Set, Callable, Any, TypeVar, DefaultDict, Union, Sequence +from typing import List, Dict, Optional, Iterator, Tuple, Set, Callable, Any, TypeVar, Union, Sequence import yaml from enum import Enum from collections import OrderedDict @@ -914,11 +914,6 @@ def main() -> None: nargs='*', help='filter dispatch backend by the whitelist (if set), ' 'e.g.: CPU CUDA QuantizedCPU ...') - parser.add_argument( - '--per_op_registration', - action='store_true', - help='group function registrations by op name and write to separate files; ' - 'must also set --op_registration_whitelist param') parser.add_argument( '--force_schema_registration', action='store_true', @@ -1011,8 +1006,7 @@ def make_file_manager(install_dir: str) -> FileManager: 'function_registrations': list(mapMaybe( compute_type_method( dispatch, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions - )) if not options.per_op_registration else [], + native_functions)), }) del fm @@ -1037,11 +1031,11 @@ def make_file_manager(install_dir: str) -> FileManager: 'function_registrations': list(mapMaybe( compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions)) if not options.per_op_registration else [], + native_functions)), 'math_function_registrations': list(mapMaybe( compute_type_method('Math', target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions)) if not options.per_op_registration else [], + native_functions)), }) cpu_fm.write('Functions.h', lambda: { 'function_declarations': list(mapMaybe(compute_function(target=Target.DECLARATION), native_functions)), @@ -1080,45 +1074,6 @@ def computeSchemaRegister() -> Dict[str, object]: } cpu_fm.write('SchemaRegister.cpp', computeSchemaRegister) - if options.per_op_registration: - def gen_per_op_registration_filename(opname: str) -> str: - return 'pt_op_register_{}.cpp'.format(opname.replace(':', '-')) - - if op_registration_whitelist is None: - raise Exception("Must set --op_registration_whitelist for per-op registration.") - - # First, group all native functions by unoverloaded operator name - grouped_functions : DefaultDict[str, List[NativeFunction]] = DefaultDict(list) - for f in native_functions: - grouped_functions[f"aten::{f.func.name.name}"].append(f) - extra_headers = [] - for b in backends: - extra_headers.append(f'#include ') - - # Next, generate registration 
for each one - for name in op_registration_whitelist: - def computePerOpRegistration() -> Dict[str, object]: - fs = grouped_functions[name] - registrations: List[str] = [] - for mb_dispatch in itertools.chain([None], backends): - # or you could pass in op_registration_whitelist, it doesn't - # matter! - # NB: Use of compute_type_method here is kind of an abuse; - # this is why we have to unconditionally write in - # torch::dispatch in the registration when it should be - # contextually clear - registrations.extend( - mapMaybe( - compute_type_method(mb_dispatch, target=Target.REGISTRATION, op_registration_whitelist=None), - fs)) - return { - 'extra_headers': extra_headers, - 'function_registrations': registrations, - } - - cpu_fm.write_with_template( - gen_per_op_registration_filename(name), 'PerOpRegistration.cpp', computePerOpRegistration) - cpu_fm.write('Declarations.yaml', lambda: format_yaml(list(map(compute_declaration_yaml, native_functions)))) if options.output_dependencies: From bc3151dee0b73e10c64788fce2d822e96aeffb4a Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Thu, 24 Sep 2020 22:10:52 -0700 Subject: [PATCH 008/292] [quant] Remove unused qconfig argument in qat linear module (#45307) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45307 fixes: https://github.com/pytorch/pytorch/issues/35634 Test Plan: Imported from OSS Reviewed By: vkuzo Differential Revision: D23917339 fbshipit-source-id: 65f8844b98198bbf93547b3d71408c2a54605218 --- torch/nn/intrinsic/qat/modules/conv_fused.py | 17 ++++++++--------- torch/nn/intrinsic/qat/modules/linear_relu.py | 4 ++-- torch/nn/qat/modules/conv.py | 7 +++---- torch/nn/qat/modules/linear.py | 7 +++---- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/torch/nn/intrinsic/qat/modules/conv_fused.py b/torch/nn/intrinsic/qat/modules/conv_fused.py index db46bb5ac2ee..5a8b0f042db1 100644 --- a/torch/nn/intrinsic/qat/modules/conv_fused.py +++ b/torch/nn/intrinsic/qat/modules/conv_fused.py @@ -162,7 +162,7 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, miss state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) @classmethod - def from_float(cls, mod, qconfig=None): + def from_float(cls, mod): r"""Create a qat module from a float module or qparams_dict Args: `mod` a float module, either produced by torch.quantization utilities @@ -170,10 +170,9 @@ def from_float(cls, mod, qconfig=None): """ assert type(mod) == cls._FLOAT_MODULE, 'qat.' 
+ cls.__name__ + '.from_float only works for ' + \ cls._FLOAT_MODULE.__name__ - if not qconfig: - assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' - assert mod.qconfig, 'Input float module must have a valid qconfig' - qconfig = mod.qconfig + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + assert mod.qconfig, 'Input float module must have a valid qconfig' + qconfig = mod.qconfig conv, bn = mod[0], mod[1] qat_convbn = cls(conv.in_channels, conv.out_channels, conv.kernel_size, conv.stride, conv.padding, conv.dilation, @@ -278,8 +277,8 @@ def forward(self, input): return F.relu(ConvBn2d._forward(self, input)) @classmethod - def from_float(cls, mod, qconfig=None): - return super(ConvBnReLU2d, cls).from_float(mod, qconfig) + def from_float(cls, mod): + return super(ConvBnReLU2d, cls).from_float(mod) class ConvReLU2d(nnqat.Conv2d): r""" @@ -313,8 +312,8 @@ def forward(self, input): self._conv_forward(input, self.weight_fake_quant(self.weight))) @classmethod - def from_float(cls, mod, qconfig=None): - return super(ConvReLU2d, cls).from_float(mod, qconfig) + def from_float(cls, mod): + return super(ConvReLU2d, cls).from_float(mod) def update_bn_stats(mod): if type(mod) in set([ConvBnReLU2d, ConvBn2d]): diff --git a/torch/nn/intrinsic/qat/modules/linear_relu.py b/torch/nn/intrinsic/qat/modules/linear_relu.py index 03f556c4ac2e..b11072ddb7be 100644 --- a/torch/nn/intrinsic/qat/modules/linear_relu.py +++ b/torch/nn/intrinsic/qat/modules/linear_relu.py @@ -34,5 +34,5 @@ def forward(self, input): return F.relu(F.linear(input, self.weight_fake_quant(self.weight), self.bias)) @classmethod - def from_float(cls, mod, qconfig=None): - return super(LinearReLU, cls).from_float(mod, qconfig) + def from_float(cls, mod): + return super(LinearReLU, cls).from_float(mod) diff --git a/torch/nn/qat/modules/conv.py b/torch/nn/qat/modules/conv.py index 63fb4b0fa1fd..7daeecddd4e1 100644 --- a/torch/nn/qat/modules/conv.py +++ b/torch/nn/qat/modules/conv.py @@ -32,7 +32,7 @@ def forward(self, input): return self._conv_forward(input, self.weight_fake_quant(self.weight)) @classmethod - def from_float(cls, mod, qconfig=None): + def from_float(cls, mod): r"""Create a qat module from a float module or qparams_dict Args: `mod` a float module, either produced by torch.quantization utilities @@ -40,9 +40,8 @@ def from_float(cls, mod, qconfig=None): """ assert type(mod) == cls._FLOAT_MODULE, 'qat.' + cls.__name__ + '.from_float only works for ' + \ cls._FLOAT_MODULE.__name__ - if not qconfig: - assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' - assert mod.qconfig, 'Input float module must have a valid qconfig' + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + assert mod.qconfig, 'Input float module must have a valid qconfig' if type(mod) == ConvReLU2d: mod = mod[0] qconfig = mod.qconfig diff --git a/torch/nn/qat/modules/linear.py b/torch/nn/qat/modules/linear.py index 77998426239f..47fc40b9b6c0 100644 --- a/torch/nn/qat/modules/linear.py +++ b/torch/nn/qat/modules/linear.py @@ -30,7 +30,7 @@ def forward(self, input): return F.linear(input, self.weight_fake_quant(self.weight), self.bias) @classmethod - def from_float(cls, mod, qconfig=None): + def from_float(cls, mod): r"""Create a qat module from a float module or qparams_dict Args: `mod` a float module, either produced by torch.quantization utilities @@ -38,9 +38,8 @@ def from_float(cls, mod, qconfig=None): """ assert type(mod) == cls._FLOAT_MODULE, ' qat.' 
+ cls.__name__ + '.from_float only works for ' + \ cls._FLOAT_MODULE.__name__ - if not qconfig: - assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' - assert mod.qconfig, 'Input float module must have a valid qconfig' + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + assert mod.qconfig, 'Input float module must have a valid qconfig' if type(mod) == LinearReLU: mod = mod[0] From 103fa3894a0dff4bd697688a4a5d6095cd45162e Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Thu, 24 Sep 2020 22:42:46 -0700 Subject: [PATCH 009/292] Revert D23841786: [pytorch][PR] Enable distributed package on windows, Gloo backend supported only Test Plan: revert-hammer Differential Revision: D23841786 (https://github.com/pytorch/pytorch/commit/0122299f9ba729aa0c9bd43764af53225e03672c) Original commit changeset: 334ba1ed73ef fbshipit-source-id: ec95432f9957df56a5a04e52661f5db920b7f57f --- .../install_miniconda3.bat | 7 --- CMakeLists.txt | 8 +-- caffe2/CMakeLists.txt | 49 ++++++--------- cmake/Dependencies.cmake | 5 +- test/cpp/dist_autograd/CMakeLists.txt | 2 +- test/distributed/test_c10d.py | 49 +++++---------- test/distributed/test_c10d_spawn.py | 8 +-- test/run_test.py | 11 ++-- tools/build_variables.bzl | 7 +-- torch/CMakeLists.txt | 33 +++++----- torch/csrc/Module.cpp | 4 +- torch/csrc/WindowsTorchApiMacro.h | 6 -- torch/csrc/distributed/c10d/comm.h | 4 +- torch/csrc/distributed/c10d/init.cpp | 10 +-- torch/csrc/distributed/c10d/reducer.cpp | 22 ++++--- torch/csrc/distributed/c10d/reducer.h | 14 ----- torch/csrc/jit/python/pybind_utils.h | 8 +-- .../csrc/jit/python/python_sugared_value.cpp | 2 +- torch/csrc/jit/runtime/interpreter.cpp | 8 +-- torch/csrc/jit/serialization/pickler.cpp | 6 +- torch/csrc/jit/serialization/unpickler.cpp | 6 +- torch/csrc/utils/future.h | 2 +- torch/distributed/rendezvous.py | 14 +---- torch/lib/c10d/CMakeLists.txt | 32 ++++------ torch/lib/c10d/FileStore.cpp | 51 +--------------- torch/lib/c10d/GlooDeviceFactory.cpp | 33 ++++------ torch/lib/c10d/ProcessGroupGloo.cpp | 61 +++---------------- torch/lib/c10d/Utils.cpp | 3 +- torch/lib/c10d/Utils.hpp | 4 -- torch/lib/c10d/test/CMakeLists.txt | 15 ++--- torch/lib/c10d/test/CUDATest.hpp | 10 +-- torch/lib/c10d/test/FileStoreTest.cpp | 8 --- torch/lib/c10d/test/ProcessGroupGlooTest.cpp | 9 +-- torch/lib/c10d/test/TestUtils.hpp | 30 +-------- torch/testing/_internal/common_distributed.py | 17 +----- torch/testing/_internal/common_utils.py | 4 -- torch/testing/_internal/dist_utils.py | 3 +- .../ddp_under_dist_autograd_test.py | 16 ++--- .../_internal/distributed/distributed_test.py | 48 ++++----------- 39 files changed, 167 insertions(+), 462 deletions(-) diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat index cf7255ce3789..a66ef4b651c5 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat @@ -12,11 +12,4 @@ call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Minic if "%REBUILD%"=="" ( call conda install -y -q python=%PYTHON_VERSION% numpy cffi pyyaml boto3 call conda install -y -q -c conda-forge cmake - call conda install -y -q -c rdonnelly libuv ) - -:: Get installed libuv path -@echo off -set libuv_ROOT=%CONDA_PARENT_DIR%\Miniconda3\Library -@echo on -echo libuv_ROOT=%libuv_ROOT% diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 3d937e0e1655..826c187b602e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,7 +103,7 @@ endif() # For non-supported platforms, turn USE_DISTRIBUTED off by default. # It is not tested and likely won't work without additional changes. -if(NOT LINUX AND NOT WIN32) +if(NOT LINUX) set(USE_DISTRIBUTED OFF CACHE STRING "Use distributed") # On macOS, if USE_DISTRIBUTED is enabled (specified by the user), # then make Gloo build with the libuv transport. @@ -226,12 +226,6 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." ON) option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF) -# Since TensorPipe does not support Windows, set it to OFF when WIN32 detected -if(WIN32) - set(USE_TENSORPIPE OFF) - message(WARNING "TensorPipe cannot be used on Windows. Set it to OFF") -endif() - # Linux distributions do not want too many embedded sources, in that sense we # need to be able to build pytorch with an (almost) empty third_party # directory. diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 219b28c69695..65f072b6f29d 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -291,29 +291,26 @@ endif() if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) if(USE_DISTRIBUTED) + add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") + target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) + add_dependencies(process_group_agent torch c10d) # Define this target even if we're building without TensorPipe, to make life # easier to other targets that depend on this. However, in that case, by not # setting the USE_TENSORPIPE compile definition, this target will just end # up being empty. Downstream targets should also add a #ifdef guard. 
- if(NOT WIN32) - add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") - target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) - add_dependencies(process_group_agent torch c10d) - - add_library(tensorpipe_agent - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" - ) - target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) - add_dependencies(tensorpipe_agent torch c10d) - if(USE_TENSORPIPE) - target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) - target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) - add_dependencies(tensorpipe_agent tensorpipe) - endif() + add_library(tensorpipe_agent + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" + ) + target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) + add_dependencies(tensorpipe_agent torch c10d) + if(USE_TENSORPIPE) + target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) + target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) + add_dependencies(tensorpipe_agent tensorpipe) endif() endif() @@ -496,7 +493,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" ) endif() - if(USE_DISTRIBUTED AND NOT WIN32) + if(USE_DISTRIBUTED) append_filelist("libtorch_distributed_sources" TORCH_SRCS) endif() endif() @@ -840,7 +837,7 @@ endif() if(BUILD_TEST AND NOT USE_ROCM) add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) add_subdirectory(${TORCH_ROOT}/test/cpp/tensorexpr ${CMAKE_BINARY_DIR}/test_tensorexpr) - if(USE_DISTRIBUTED AND NOT WIN32) + if(USE_DISTRIBUTED) add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc) endif() endif() @@ -892,7 +889,9 @@ endif() DESTINATION share/cmake/Torch) if(USE_DISTRIBUTED) - add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) + if(NOT MSVC) + add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) + endif() endif() @@ -967,14 +966,6 @@ if(USE_DISTRIBUTED) target_compile_definitions(torch_cpu PRIVATE USE_DISTRIBUTED ) - # Pass USE_RPC in order to reduce use of - # #if defined(USE_DISTRIBUTED) && !defined(_WIN32) - # need to be removed when RPC is supported - if(NOT WIN32) - target_compile_definitions(torch_cpu PRIVATE - USE_RPC - ) - endif() # Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp # can only be compiled with USE_TENSORPIPE is set. 
if(USE_TENSORPIPE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 023bbe9e8d07..028098f61d36 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1253,7 +1253,10 @@ if(USE_CUDA) endif() if(USE_GLOO) - if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(MSVC) + message(WARNING "Gloo can not be used on Windows.") + caffe2_update_option(USE_GLOO OFF) + elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) message(WARNING "Gloo can only be used on 64-bit systems.") caffe2_update_option(USE_GLOO OFF) else() diff --git a/test/cpp/dist_autograd/CMakeLists.txt b/test/cpp/dist_autograd/CMakeLists.txt index 9969c63e16d5..5d23602881f0 100644 --- a/test/cpp/dist_autograd/CMakeLists.txt +++ b/test/cpp/dist_autograd/CMakeLists.txt @@ -1,4 +1,4 @@ -if(USE_DISTRIBUTED AND NOT WIN32) +if(USE_DISTRIBUTED) set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd") set(DIST_AUTOGRAD_TEST_SOURCES ${TORCH_ROOT}/test/cpp/common/main.cpp diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 911a73ce432e..a81bc53f175a 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -29,7 +29,7 @@ from torch.testing._internal.common_distributed import MultiProcessTestCase, \ requires_gloo, requires_nccl, requires_nccl_version, \ skip_if_not_multigpu, skip_if_lt_x_gpu, get_timeout, skip_if_rocm, \ - simple_sparse_reduce_tests, skip_if_win32, create_device + simple_sparse_reduce_tests from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, \ retry_on_connect_failures, ADDRESS_IN_USE, CONNECT_TIMEOUT, TEST_WITH_TSAN @@ -255,7 +255,6 @@ def create_tcp_store(addr): raise RuntimeError("Unable to find free port (tried %s)" % ", ".join(ports)) -@skip_if_win32() class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): store = create_tcp_store('localhost') @@ -274,7 +273,6 @@ def test_address_already_in_use(self): store2 = c10d.TCPStore(addr, port, 1, True) # noqa: F841 -@skip_if_win32() class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): super(PrefixTCPStoreTest, self).setUp() @@ -331,7 +329,6 @@ def test_unknown_handler(self): c10d.rendezvous('invalid://') -@skip_if_win32() class RendezvousEnvTest(TestCase): @retry_on_connect_failures def test_common_errors(self): @@ -458,7 +455,7 @@ def test_common_errors(self): def test_nominal(self): with tempfile.NamedTemporaryFile(delete=False) as file: - url = f'file:///{file.name.replace(os.path.sep, "/")}?world_size=2' + url = 'file://%s?world_size=%d' % (file.name, 2) gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -477,7 +474,6 @@ def test_nominal(self): self.assertEqual(b"value1", store0.get("key1")) -@skip_if_win32() class RendezvousTCPTest(TestCase): def create_tcp_url(self): @@ -548,13 +544,9 @@ def _test_store_timeout(self, backend, init_method, c2p): def _init_methods(self): f = tempfile.NamedTemporaryFile(delete=False) - if sys.platform == 'win32': - yield "file:///%s" % f.name.replace("\\", "/") - f.close() - else: - yield "file://%s" % f.name - f.close() - yield "tcp://127.0.0.1:%d" % common.find_free_port() + yield "file://%s" % f.name + f.close() + yield "tcp://127.0.0.1:%d" % common.find_free_port() def _test_default_store_timeout(self, backend): for init_method in self._init_methods(): @@ -592,16 +584,11 @@ def test_default_store_timeout_gloo(self): class ProcessGroupGlooTest(MultiProcessTestCase): def setUp(self): super(ProcessGroupGlooTest, self).setUp() - - # For Windows platform, 
Python does not support fork, change it to spawn here. - if sys.platform == 'win32': - self._spawn_processes() - else: - self._fork_processes() + self._fork_processes() def opts(self, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [create_device(interface=LOOPBACK)] + opts.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] opts.timeout = 5.0 opts.threads = threads return opts @@ -611,8 +598,8 @@ def test_multi_device_constructor(self): opts = c10d.ProcessGroupGloo.Options() opts.timeout = 5.0 opts.devices = [ - create_device(interface=LOOPBACK), - create_device(interface=LOOPBACK), + c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), + c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), ] pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts) @@ -1527,7 +1514,6 @@ def test_barrier_implies_wait(self): for i, tensor in enumerate(tensors): self.assertEqual(torch.full(size, float(i * self.world_size)), tensor) - @skip_if_win32() def test_round_robin(self): num_process_groups = 2 store = c10d.FileStore(self.file_name, self.world_size) @@ -1545,7 +1531,6 @@ def test_round_robin(self): pg.broadcast(tensor, root=0).wait() self.assertEqual(torch.full([100, 100], 0.), tensor) - @skip_if_win32() def test_round_robin_create_destroy(self): store = c10d.FileStore(self.file_name, self.world_size) @@ -1974,10 +1959,7 @@ def forward(self, x): class DistributedDataParallelTest(MultiProcessTestCase): def setUp(self): super(DistributedDataParallelTest, self).setUp() - if sys.platform == 'win32': - self._spawn_processes() - else: - self._fork_processes() + self._fork_processes() def tearDown(self): # DistributedDataParallel test doesn't seem to call FileStore destructor @@ -2086,7 +2068,7 @@ def update_parameters(model): def _test_gloo_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [create_device(interface=LOOPBACK)] + options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @@ -3965,10 +3947,7 @@ def test_nccl_timeout(self): class CommTest(MultiProcessTestCase): def setUp(self): super(CommTest, self).setUp() - if sys.platform == 'win32': - self._spawn_processes() - else: - self._fork_processes() + self._fork_processes() def tearDown(self): super(CommTest, self).tearDown() @@ -4034,7 +4013,7 @@ def test_broadcast_coalesced_nccl(self): def test_broadcast_coalesced_gloo_cuda(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [create_device(interface=LOOPBACK)] + options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cuda:%d" % self.rank) ranks = list(range(self.world_size)) @@ -4045,7 +4024,7 @@ def test_broadcast_coalesced_gloo_cuda(self): def test_broadcast_coalesced_gloo_cpu(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [create_device(interface=LOOPBACK)] + options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) 
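The construction pattern these hunks keep touching, condensed into a standalone sketch (single process, rank 0, world size 1, loopback device; the concrete values are illustrative, not taken from the test):

```
import tempfile
import torch
import torch.distributed as c10d

tmp = tempfile.NamedTemporaryFile(delete=False)
store = c10d.FileStore(tmp.name, 1)              # world_size = 1
opts = c10d.ProcessGroupGloo.Options()
opts.devices = [c10d.ProcessGroupGloo.create_device(hostname="127.0.0.1")]
opts.timeout = 5.0
pg = c10d.ProcessGroupGloo(store, 0, 1, opts)    # rank = 0

t = torch.full((100, 100), 1.0)
pg.broadcast(t, root=0).wait()                   # trivially complete with one rank
```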
device = torch.device("cpu") ranks = list(range(self.world_size)) diff --git a/test/distributed/test_c10d_spawn.py b/test/distributed/test_c10d_spawn.py index c84608e8f178..d0bf00b8a08a 100644 --- a/test/distributed/test_c10d_spawn.py +++ b/test/distributed/test_c10d_spawn.py @@ -10,10 +10,8 @@ import torch.nn as nn from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU -from torch.testing._internal.common_distributed import requires_gloo, \ - create_device -from torch.testing._internal.common_utils import TestCase, load_tests, \ - run_tests, skipIfRocm +from torch.testing._internal.common_distributed import requires_gloo +from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, skipIfRocm from torch.testing._internal.common_utils import NO_MULTIPROCESSING_SPAWN, TEST_WITH_TSAN @@ -41,7 +39,7 @@ class ProcessGroupShareTensorTest(TestCase): @classmethod def opts(cls, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [create_device(interface='lo')] + opts.devices = [c10d.ProcessGroupGloo.create_device(interface="lo")] opts.timeout = 5.0 opts.threads = threads return opts diff --git a/test/run_test.py b/test/run_test.py index 0f9d14a78605..d63fc372f9c2 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -13,7 +13,7 @@ import torch import torch._six from torch.utils import cpp_extension -from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell, FILE_SCHEMA +from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell import torch.distributed as dist from typing import Dict, Optional @@ -99,6 +99,7 @@ 'distributed/rpc/test_process_group_agent', 'distributed/rpc/test_tensorpipe_agent', 'distributed/test_distributed_fork', + 'distributed/test_distributed_spawn', ] ROCM_BLOCKLIST = [ @@ -305,13 +306,9 @@ def test_distributed(test_module, test_directory, options): 'MPI not available -- MPI backend tests will be skipped') config = DISTRIBUTED_TESTS_CONFIG for backend, env_vars in config.items(): - if sys.platform == 'win32' and backend != 'gloo': - continue if backend == 'mpi' and not mpi_available: continue for with_init_file in {True, False}: - if sys.platform == 'win32' and not with_init_file: - continue tmp_dir = tempfile.mkdtemp() if options.verbose: init_str = "with {} init_method" @@ -325,9 +322,9 @@ def test_distributed(test_module, test_directory, options): os.environ.update(env_vars) if with_init_file: if test_module in ["test_distributed_fork", "test_distributed_spawn"]: - init_method = f'{FILE_SCHEMA}{tmp_dir}/' + init_method = 'file://{}/'.format(tmp_dir) else: - init_method = f'{FILE_SCHEMA}{tmp_dir}/shared_init_file' + init_method = 'file://{}/shared_init_file'.format(tmp_dir) os.environ['INIT_METHOD'] = init_method try: os.mkdir(os.path.join(tmp_dir, 'barrier')) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index c21fab8ec2cf..174bb858da44 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -537,14 +537,11 @@ libtorch_python_core_sources = [ "torch/csrc/utils/disable_torch_function.cpp", ] -libtorch_python_distributed_core_sources = [ +libtorch_python_distributed_sources = [ + "torch/csrc/distributed/autograd/init.cpp", "torch/csrc/distributed/c10d/comm.cpp", "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/reducer.cpp", -] - -libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ - "torch/csrc/distributed/autograd/init.cpp", "torch/csrc/distributed/rpc/init.cpp", 
"torch/csrc/distributed/rpc/process_group_agent.cpp", "torch/csrc/distributed/rpc/py_rref.cpp", diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 2ae2f7f737fe..b78dc4a362a7 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -160,28 +160,25 @@ endif() if(USE_DISTRIBUTED) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED) - if(WIN32) - append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) - else() - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RPC) + if(NOT MSVC) append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS) + # Disable certain warnings for GCC-9.X + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + endif() + list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) + if(USE_TENSORPIPE) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) + list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) + endif() endif() - # Disable certain warnings for GCC-9.X - if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - endif() - if(USE_TENSORPIPE) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) - list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) - endif() - list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) endif() -if(USE_NCCL AND NOT WIN32) +if(USE_NCCL) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index ae6f15155f2a..ed4aa21a8f76 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -688,9 +688,9 @@ PyObject* initModule() { #ifdef USE_CUDA THPUtils_addPyMethodDefs(methods, THCPModule_methods()); #endif -#if defined(USE_DISTRIBUTED) && defined(USE_C10D) +#ifdef USE_DISTRIBUTED +#ifdef USE_C10D THPUtils_addPyMethodDefs(methods, torch::distributed::c10d::python_functions()); -#ifndef _WIN32 THPUtils_addPyMethodDefs(methods, torch::distributed::rpc::python_functions()); THPUtils_addPyMethodDefs( methods, torch::distributed::autograd::python_functions()); diff --git a/torch/csrc/WindowsTorchApiMacro.h b/torch/csrc/WindowsTorchApiMacro.h index 7f44db0baba9..7f8ef4e01677 100644 --- a/torch/csrc/WindowsTorchApiMacro.h +++ b/torch/csrc/WindowsTorchApiMacro.h @@ -5,9 +5,3 @@ // There's no difference between aten, torch and caffe2 
libs any more // TODO: clean up the naming for consistency #define TORCH_API CAFFE2_API - -#ifdef _WIN32 -#define TORCH_PYTHON_API -#else -#define TORCH_PYTHON_API CAFFE2_API -#endif diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index 2eb626c40232..e2b501f08aff 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -38,7 +38,7 @@ class GradBucket { // DDP's c10d reducer allows communication hooks defined as a sub class // of CommHookInterface. CommHookInterface is an abstract class and can // be used to implement both Python and CPP hooks. -struct TORCH_PYTHON_API CommHookInterface { +struct TORCH_API CommHookInterface { public: virtual ~CommHookInterface() {} @@ -59,7 +59,7 @@ struct TORCH_PYTHON_API CommHookInterface { // PythonCommHook enables registering a python hook to c10d reducer and is a // sub class of CommHookInterface. -class TORCH_PYTHON_API PythonCommHook : public CommHookInterface { +class TORCH_API PythonCommHook : public CommHookInterface { public: // The constructor takes a state and a callable hook. Inputs are Python // objects. The state is passed to the hook in runHook function can be used to diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index be1752d7366f..165d6a1c8603 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1,11 +1,7 @@ #include #include -#ifndef _WIN32 #include -#include -#include -#endif #include #ifdef USE_C10D_GLOO @@ -21,6 +17,8 @@ #endif #include +#include +#include #include #include @@ -325,7 +323,6 @@ They are used in specifying strategies for reduction collectives, e.g., shared_ptr_class_<::c10d::FileStore>(module, "FileStore", store) .def(py::init()); -#ifndef _WIN32 shared_ptr_class_<::c10d::HashStore>(module, "HashStore", store) .def(py::init<>()); @@ -343,7 +340,6 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("is_master"), py::arg("timeout") = std::chrono::milliseconds(::c10d::Store::kDefaultTimeout)); -#endif shared_ptr_class_<::c10d::PrefixStore>(module, "PrefixStore", store) .def(py::init>()); @@ -611,7 +607,6 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("opts") = ::c10d::BarrierOptions(), py::call_guard()); -#ifndef _WIN32 module.def( "_round_robin_process_groups", [](std::vector> processGroups) @@ -625,7 +620,6 @@ They are used in specifying strategies for reduction collectives, e.g., }, py::arg("process_groups"), py::call_guard()); -#endif #ifdef USE_C10D_GLOO auto processGroupGloo = shared_ptr_class_<::c10d::ProcessGroupGloo>( diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 814d3494ff4e..86916c7994dd 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -89,7 +89,10 @@ Reducer::Reducer( for (size_t variable_index = 0; variable_index < variable_count; variable_index++) { auto& variable = replicas_[replica_index][variable_index]; - const auto index = VariableIndex(replica_index, variable_index); + const auto index = VariableIndex{ + .replica_index = replica_index, + .variable_index = variable_index, + }; // The gradient accumulator function is lazily initialized once. 
// Therefore we can use its presence in the autograd graph as @@ -97,19 +100,15 @@ Reducer::Reducer( auto grad_accumulator = torch::autograd::impl::grad_accumulator(variable); -#ifndef _WIN32 using torch::distributed::autograd::ThreadLocalDistAutogradContext; -#endif // Hook to execute after the gradient accumulator has executed. hooks_.emplace_back( grad_accumulator->add_post_hook( torch::make_unique( [=](const torch::autograd::variable_list& outputs, const torch::autograd::variable_list& /* unused */) { -#ifndef _WIN32 this->rpc_context_.set( ThreadLocalDistAutogradContext::getContextPtr()); -#endif this->autograd_hook(index); return outputs; })), @@ -478,7 +477,10 @@ void Reducer::push_rebuilt_params_for_all_indices() { const auto variable_count = replicas_[replica_index].size(); for (size_t variable_index = 0; variable_index < variable_count; ++variable_index) { - const auto index = VariableIndex(replica_index, variable_index); + const auto index = VariableIndex{ + .replica_index = replica_index, + .variable_index = variable_index, + }; push_rebuilt_params(index); } } @@ -848,8 +850,10 @@ void Reducer::initialize_buckets( TORCH_CHECK( variable_index < variable_locators_.size(), "Out of range variable index specified."); - variable_locators_[variable_index] = VariableLocator( - bucket_index, intra_bucket_index++); + variable_locators_[variable_index] = VariableLocator{ + .bucket_index = bucket_index, + .intra_bucket_index = intra_bucket_index++, + }; } bucket.variable_indices = std::move(bucket_indices[bucket_index]); @@ -1231,9 +1235,7 @@ void Reducer::runGradCallbackForVariable( cb(variable.mutable_grad()); } else { // Under distributed autograd -#ifndef _WIN32 context_ptr->runGradCallbackForVariable(variable, std::move(cb)); -#endif } } diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 486b7337366a..960a32356acf 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -104,13 +104,6 @@ class Reducer { struct VariableIndex { size_t replica_index; size_t variable_index; - - VariableIndex() = default; - - VariableIndex(size_t replica_index_, size_t variable_index_) { - replica_index = replica_index_; - variable_index = variable_index_; - } }; void push_rebuilt_params(const VariableIndex& index); @@ -288,13 +281,6 @@ class Reducer { size_t bucket_index; // Index of parameter in single bucket replica. size_t intra_bucket_index; - - VariableLocator() = default; - - VariableLocator(size_t bucket_index_, size_t intra_bucket_index_) { - bucket_index = bucket_index_; - intra_bucket_index = intra_bucket_index_; - } }; // Map the index of a variable to its location in the bucket structure. 
diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index 4be55a9caa90..65f5a49145c8 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -320,7 +320,7 @@ inline InferredType tryToInferType(py::handle input) { if (py::isinstance(input)) { auto object = py::cast(input); return InferredType(object.type()); -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED } else if (py::isinstance(input)) { auto rref_ivalue = input.cast().toIValue(); return InferredType(rref_ivalue.type()); @@ -716,7 +716,7 @@ inline IValue toIValue( } } case TypeKind::RRefType: { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED return obj.cast().toIValue(); #else AT_ERROR("RRef is only supported with the distributed package"); @@ -896,7 +896,7 @@ inline py::object toPyObject(IValue ivalue) { } return std::move(py_dict); } else if (ivalue.isRRef()) { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED auto RRefPtr = c10::dynamic_intrusive_pointer_cast( std::move(ivalue).toRRef()); @@ -942,7 +942,7 @@ inline py::object toPyObject(IValue ivalue) { auto py_class = getScriptedClassOrError(qualified_class_name); return py_class.attr(enum_holder->name().c_str()); } else if (ivalue.isRRef()) { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED return py::cast(torch::distributed::rpc::PyRRef( c10::static_intrusive_pointer_cast( ivalue.toRRef()))); diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 119b6b5e5de7..ba94d33f37b3 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -916,7 +916,7 @@ std::shared_ptr toSugaredValue( } else if ( obj.ptr() == py::module::import("torch.jit").attr("annotate").ptr()) { return SpecialFormValue::create(prim::annotate); -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED // RPC module is only avaialble when build flag "USE_DISTRIBUTED" is on. 
} else if ( obj.ptr() == diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index f61e2597447f..337fe66c0789 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -23,7 +23,7 @@ #include #include -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED #include using torch::distributed::autograd::DistAutogradContainer; #endif @@ -267,7 +267,7 @@ void insertLastUses(Graph& g) { } inline int64_t getDistAutogradContextId() { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED return DistAutogradContainer::currentContextId(); #else return 0; @@ -1690,7 +1690,7 @@ InterpreterState::InterpreterState( : pImpl(std::move(pImpl_)) {} void InterpreterContinuation::operator()() { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED auto prev_dist_id = DistAutogradContainer::currentContextId(); DistAutogradContainer::forceCurrentContextId(dist_autograd_context_id_); #endif @@ -1700,7 +1700,7 @@ void InterpreterContinuation::operator()() { } else { state.runAsync(stack); } -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED DistAutogradContainer::forceCurrentContextId(prev_dist_id); #endif } diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 2bc9abea8c57..6f911f4246cc 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -1,6 +1,6 @@ #include #include -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED #include #endif #include @@ -130,7 +130,7 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { "this class."; AT_ERROR(err.str()); } else if (ivalue.isRRef()) { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED TORCH_CHECK( torch::distributed::rpc::getAllowJitRRefPickle() == true, "RRef jit pickling is only allowed inside RPC calls."); @@ -166,7 +166,7 @@ void Pickler::pushDevice(const IValue& ivalue) { } } -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED void Pickler::pushRRef(const IValue& ivalue) { // It is the same as how rref is pickled in python, see PyRRef::pickle auto rrefInterface = ivalue.toRRef(); diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index 9b8fce0b4869..c416f9641023 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -1,6 +1,6 @@ #include #include -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED #include #endif #include @@ -549,7 +549,7 @@ void Unpickler::readGlobal( stack_.emplace_back(int64_t(globals_.size() - 1)); return; } else if (module_name == "torch.distributed.rpc" && class_name == "rref") { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED return rebuildRRef(); #else TORCH_INTERNAL_ASSERT( @@ -669,7 +669,7 @@ void Unpickler::rebuildTensor(bool quantized) { }); } -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED void Unpickler::rebuildRRef() { globals_.emplace_back([this] { // It is the same as how rref is unpickled in python, diff --git a/torch/csrc/utils/future.h b/torch/csrc/utils/future.h index 093d043ecf7d..6d672ee86cd5 100644 --- a/torch/csrc/utils/future.h +++ b/torch/csrc/utils/future.h @@ -26,7 +26,7 @@ class TORCH_API FutureError final : public std::exception { // Most implementation is copied from FutureMessage and // c10::ivalue::Future template -class TORCH_PYTHON_API Future final { +class TORCH_API Future final { public: Future() = default; diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py index 4545aea2bf56..292634580aab 100644 --- a/torch/distributed/rendezvous.py +++ b/torch/distributed/rendezvous.py @@ -6,12 +6,9 @@ import torch._six as six 
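The rendezvous.py hunk below removes the Windows-specific handling of file:// init methods, matching the FILE_SCHEMA removal elsewhere in this revert. For reference, that handling converts a file URL back into a filesystem path roughly as follows (a sketch, not the reverted code itself):

```
import sys
from urllib.parse import urlparse
from urllib.request import url2pathname

def path_from_file_url(url):
    result = urlparse(url)
    path = result.path
    if sys.platform == 'win32':
        # "file:///C:/tmp/init" parses to "/C:/tmp/init"; url2pathname
        # rewrites that into the native form "C:\tmp\init".
        path = url2pathname(result.path)
    return path
```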
import numbers import os -import sys -from . import FileStore +from . import FileStore, TCPStore from .constants import default_pg_timeout -if sys.platform != 'win32': - from . import TCPStore _rendezvous_handlers = {} @@ -93,10 +90,6 @@ def _error(msg): result = urlparse(url) path = result.path - if sys.platform == 'win32': - import urllib.request - path = urllib.request.url2pathname(result.path) - if not path: raise _error("path missing") query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) @@ -182,8 +175,7 @@ def _env_error(var): # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using env:// method") -if sys.platform != 'win32': - register_rendezvous_handler("tcp", _tcp_rendezvous_handler) - register_rendezvous_handler("env", _env_rendezvous_handler) register_rendezvous_handler("file", _file_rendezvous_handler) +register_rendezvous_handler("tcp", _tcp_rendezvous_handler) +register_rendezvous_handler("env", _env_rendezvous_handler) diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index 4b206f380111..68fe49f411f5 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -45,16 +45,15 @@ endfunction() set(C10D_SRCS FileStore.cpp + HashStore.cpp ProcessGroup.cpp + ProcessGroupRoundRobin.cpp Store.cpp PrefixStore.cpp + TCPStore.cpp Utils.cpp ) -if(NOT WIN32) - list(APPEND C10D_SRCS HashStore.cpp ProcessGroupRoundRobin.cpp TCPStore.cpp) -endif() - set(C10D_LIBS torch) if(USE_C10D_NCCL) @@ -78,17 +77,14 @@ endif() add_library(c10d STATIC ${C10D_SRCS}) set_property(TARGET c10d PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET c10d PROPERTY CXX_STANDARD 14) - -if(NOT MSVC) - target_compile_options(c10d PUBLIC - -Wall - -Wextra - -Wno-unused-parameter - -Wno-missing-field-initializers - -Wno-write-strings - -Wno-unknown-pragmas - ) -endif() +target_compile_options(c10d PUBLIC + -Wall + -Wextra + -Wno-unused-parameter + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + ) add_dependencies(c10d torch) @@ -122,19 +118,17 @@ if(USE_C10D_GLOO) endif() copy_header(FileStore.hpp) +copy_header(HashStore.hpp) copy_header(PrefixStore.hpp) copy_header(ProcessGroup.hpp) copy_header(Store.hpp) +copy_header(TCPStore.hpp) copy_header(Types.hpp) copy_header(Utils.hpp) if(USE_GLOO) copy_header(ProcessGroupGloo.hpp) copy_header(GlooDeviceFactory.hpp) endif() -if(NOT WIN32) - copy_header(HashStore.hpp) - copy_header(TCPStore.hpp) -endif() if(USE_C10D_NCCL) copy_header(ProcessGroupNCCL.hpp) diff --git a/torch/lib/c10d/FileStore.cpp b/torch/lib/c10d/FileStore.cpp index eb25c52f787a..55346e0fa635 100644 --- a/torch/lib/c10d/FileStore.cpp +++ b/torch/lib/c10d/FileStore.cpp @@ -3,16 +3,9 @@ #include #include #include -#include - -#ifdef _WIN32 -#include -#include -#include -#else #include +#include #include -#endif #include #include @@ -28,40 +21,6 @@ throw std::system_error(errno, std::system_category(), ##__VA_ARGS__); \ } -#ifdef _WIN32 -#define LOCK_EX 0x00000001 -#define LOCK_SH 0x00000010 -#define LOCK_UN 0x00000100 - -int flock_(int fd, int op) { - HANDLE hdl = (HANDLE) _get_osfhandle(fd); - DWORD low = 1, high = 0; - OVERLAPPED offset = {0, 0, 0, 0, NULL}; - - if (hdl < 0) - return -1; - - switch (op) { - case LOCK_EX: - if (LockFileEx(hdl, LOCKFILE_EXCLUSIVE_LOCK, 0, low, high, &offset)) - return 0; - break; - case LOCK_SH: - if (LockFileEx(hdl, 0, 0, low, high, &offset)) - return 0; - break; - case LOCK_UN: - 
if(UnlockFileEx(hdl, 0, low, high, &offset) != 0) - return 0; - break; - default: - break; - } - errno = EINVAL; - return -1; -} -#endif - namespace c10d { namespace { @@ -120,11 +79,7 @@ class Lock { int fd_{-1}; void flock(int operation) { -#ifdef _WIN32 - auto rv = syscall(std::bind(::flock_, fd_, operation)); -#else auto rv = syscall(std::bind(::flock, fd_, operation)); -#endif SYSASSERT(rv, "flock"); } }; @@ -137,11 +92,7 @@ class File { std::chrono::milliseconds timeout) { const auto start = std::chrono::steady_clock::now(); while (true) { -#ifdef _WIN32 - fd_ = syscall(std::bind(::open, path.c_str(), flags | _O_BINARY, _S_IREAD | _S_IWRITE)); -#else fd_ = syscall(std::bind(::open, path.c_str(), flags, 0644)); -#endif // Only retry when the file doesn't exist, since we are waiting for the // file to be created in this case to address the following issue: // https://github.com/pytorch/pytorch/issues/13750 diff --git a/torch/lib/c10d/GlooDeviceFactory.cpp b/torch/lib/c10d/GlooDeviceFactory.cpp index dca6b03eb9dd..70c3c2bb7a31 100644 --- a/torch/lib/c10d/GlooDeviceFactory.cpp +++ b/torch/lib/c10d/GlooDeviceFactory.cpp @@ -36,16 +36,16 @@ C10_DEFINE_SHARED_REGISTRY_WITHOUT_WARNING( #if GLOO_HAVE_TRANSPORT_TCP static std::shared_ptr<::gloo::transport::Device> makeTCPDevice( - const std::string& interfaceName, + const std::string& interface, const std::string& hostname) { TORCH_CHECK( - !interfaceName.empty() || !hostname.empty(), + !interface.empty() || !hostname.empty(), "GlooDeviceFactory::makeTCPDevice(): interface or hostname " "can't be empty"); ::gloo::transport::tcp::attr attr; - if (!interfaceName.empty()) { - attr.iface = interfaceName; + if (!interface.empty()) { + attr.iface = interface; } else { attr.hostname = hostname; } @@ -61,16 +61,16 @@ C10_REGISTER_CREATOR(GlooDeviceRegistry, TCP, makeTCPDevice); #if GLOO_HAVE_TRANSPORT_UV static std::shared_ptr<::gloo::transport::Device> makeUVDevice( - const std::string& interfaceName, + const std::string& interface, const std::string& hostname) { TORCH_CHECK( - !interfaceName.empty() || !hostname.empty(), + !interface.empty() || !hostname.empty(), "GlooDeviceFactory::makeUVDevice(): interface or hostname " "can't be empty"); ::gloo::transport::uv::attr attr; - if (!interfaceName.empty()) { - attr.iface = interfaceName; + if (!interface.empty()) { + attr.iface = interface; } else { attr.hostname = hostname; } @@ -81,28 +81,23 @@ static std::shared_ptr<::gloo::transport::Device> makeUVDevice( // the flexibility of other application to override by priority. Register // UV to `UV` for env "GLOO_DEVICE_TRANSPORT" override. 
C10_REGISTER_CREATOR(GlooDeviceRegistry, APPLE, makeUVDevice); -C10_REGISTER_CREATOR(GlooDeviceRegistry, WIN32, makeUVDevice); C10_REGISTER_CREATOR(GlooDeviceRegistry, UV, makeUVDevice); #endif static const char* glooDeviceTransport = getenv("GLOO_DEVICE_TRANSPORT"); std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory:: - makeDeviceForInterface(const std::string& interfaceName) { + makeDeviceForInterface(const std::string& interface) { if (glooDeviceTransport) { - return GlooDeviceRegistry()->Create(glooDeviceTransport, interfaceName, ""); + return GlooDeviceRegistry()->Create(glooDeviceTransport, interface, ""); } #ifdef __linux__ - return GlooDeviceRegistry()->Create("LINUX", interfaceName, ""); + return GlooDeviceRegistry()->Create("LINUX", interface, ""); #endif #ifdef __APPLE__ - return GlooDeviceRegistry()->Create("APPLE", interfaceName, ""); -#endif - -#ifdef _WIN32 - return GlooDeviceRegistry()->Create("WIN32", interfaceName, ""); + return GlooDeviceRegistry()->Create("APPLE", interface, ""); #endif throw std::runtime_error("makeDeviceForInterface(): unsupported gloo device"); @@ -122,10 +117,6 @@ std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory:: return GlooDeviceRegistry()->Create("APPLE", "", hostname); #endif -#ifdef _WIN32 - return GlooDeviceRegistry()->Create("WIN32", "", hostname); -#endif - throw std::runtime_error("makeDeviceForHostname(): unsupported gloo device"); } diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index c139ac7a34fd..531fe751f1c9 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -2,16 +2,10 @@ #include -#ifdef _WIN32 -#include -#include -#include -#else #include #include -#include -#endif #include +#include #include @@ -42,36 +36,6 @@ #include #include -#ifdef _WIN32 -#define GENERATE_ALL_TYPES(type, func, ...) \ - switch (type) { \ - case ::at::ScalarType::Float: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Double: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Half: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Char: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Byte: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Int: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Long: \ - func(__VA_ARGS__); \ - break; \ - default: \ - throw std::runtime_error("Invalid scalar type"); \ - } - -#define HOST_NAME_MAX 256 -#else #define GENERATE_ALL_TYPES(type, func, args...) \ switch (type) { \ case ::at::ScalarType::Float: \ @@ -98,7 +62,6 @@ default: \ throw std::runtime_error("Invalid scalar type"); \ } -#endif namespace c10d { @@ -446,19 +409,12 @@ ProcessGroupGloo::Options::Options() namespace { -void socketInitialize() { -#ifdef _WIN32 - ::gloo::init_winsock(); -#endif -} - // Gloo assumes that this machine's hostname can always be resolved // to an address. If it doesn't it throws a runtime error saying // that it can't be resolved. Instead of catching it, we choose // to proactively check if an address can be resolved, so we can // gracefully fall back to an alternative if it doesn't. 
bool doesHostnameResolveToUsableAddress(const std::string& hostname) { - socketInitialize(); struct addrinfo hints; memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; @@ -475,11 +431,7 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) { continue; } rv = bind(fd, rp->ai_addr, rp->ai_addrlen); -#ifdef _WIN32 - closesocket(fd); -#else close(fd); -#endif if (rv == -1) { continue; } @@ -491,11 +443,14 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) { } // namespace +#if defined(__linux__) || defined(__APPLE__) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: - createDeviceForInterface(const std::string& interface_name) { - return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface_name); + createDeviceForInterface(const std::string& interface) { + return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface); } +#endif +#if defined(__linux__) || defined(__APPLE__) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: createDeviceForHostname(const std::string& hostname) { TORCH_CHECK( @@ -505,14 +460,14 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: " to a (local) address"); return ::c10d::GlooDeviceFactory::makeDeviceForHostname(hostname); } +#endif -#if defined(__linux__) || defined(_WIN32) +#ifdef __linux__ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: createDefaultDevice() { // Use the hostname to resolve the network address to // use. Note: if the hostname does not resolve to an address (e.g. // because of misconfigured /etc/hosts file), this will not work. - socketInitialize(); std::array hostname{}; auto rv = gethostname(hostname.data(), HOST_NAME_MAX); if (rv != 0) { diff --git a/torch/lib/c10d/Utils.cpp b/torch/lib/c10d/Utils.cpp index 6c6e941ef95d..d975f6eb6bc5 100644 --- a/torch/lib/c10d/Utils.cpp +++ b/torch/lib/c10d/Utils.cpp @@ -1,6 +1,5 @@ #include -#ifndef _WIN32 #include #include @@ -355,6 +354,6 @@ std::tuple accept( return std::make_tuple( socket, sockaddrToString(reinterpret_cast(&addr))); } + } // namespace tcputil } // namespace c10d -#endif diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 1116cd39ba1c..1bdaddde9f24 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -1,8 +1,6 @@ #pragma once -#ifndef _WIN32 #include -#endif #include #include @@ -482,7 +480,6 @@ class ResourceGuard { bool released_; }; -#ifndef _WIN32 namespace tcputil { constexpr std::chrono::milliseconds kNoTimeout = std::chrono::milliseconds(-1); @@ -612,5 +609,4 @@ std::tuple accept( const std::chrono::milliseconds& timeout = kNoTimeout); } // namespace tcputil -#endif } // namespace c10d diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index 003f56f30861..8429d1099b29 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -8,19 +8,14 @@ function(c10d_add_test test_src) get_filename_component(test_name ${test_src} NAME_WE) add_executable(${test_name} "${test_src}") target_include_directories(${test_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..) 
- target_link_libraries(${test_name} ${ARGN}) - if(NOT WIN32) - target_link_libraries(${test_name} pthread) - target_compile_options(${test_name} PRIVATE -Wno-error) - endif() + target_link_libraries(${test_name} pthread ${ARGN}) + target_compile_options(${test_name} PRIVATE -Wno-error) add_test(NAME ${test_name} COMMAND $) endfunction() c10d_add_test(FileStoreTest.cpp c10d gtest_main) -if(NOT WIN32) - c10d_add_test(HashStoreTest.cpp c10d gtest_main) - c10d_add_test(TCPStoreTest.cpp c10d gtest_main) -endif() +c10d_add_test(HashStoreTest.cpp c10d gtest_main) +c10d_add_test(TCPStoreTest.cpp c10d gtest_main) if(USE_CUDA) if(USE_C10D_GLOO) @@ -34,7 +29,7 @@ if(USE_CUDA) endif() else() if(USE_C10D_GLOO) - c10d_add_test(ProcessGroupGlooTest.cpp c10d gtest_main) + c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d gtest_main) endif() endif() diff --git a/torch/lib/c10d/test/CUDATest.hpp b/torch/lib/c10d/test/CUDATest.hpp index 328da2faf648..defaff895a18 100644 --- a/torch/lib/c10d/test/CUDATest.hpp +++ b/torch/lib/c10d/test/CUDATest.hpp @@ -5,15 +5,9 @@ namespace c10d { namespace test { -#ifdef _WIN32 -#define EXPORT_TEST_API __declspec(dllexport) -#else -#define EXPORT_TEST_API -#endif +void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks); -EXPORT_TEST_API void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks); - -EXPORT_TEST_API int cudaNumDevices(); +int cudaNumDevices(); } // namespace test } // namespace c10d diff --git a/torch/lib/c10d/test/FileStoreTest.cpp b/torch/lib/c10d/test/FileStoreTest.cpp index cc8da6326091..77215f4521c2 100644 --- a/torch/lib/c10d/test/FileStoreTest.cpp +++ b/torch/lib/c10d/test/FileStoreTest.cpp @@ -1,8 +1,6 @@ #include -#ifndef _WIN32 #include -#endif #include #include @@ -12,11 +10,6 @@ #include #include -#ifdef _WIN32 -std::string tmppath() { - return c10d::test::autoGenerateTmpFilePath(); -} -#else std::string tmppath() { const char* tmpdir = getenv("TMPDIR"); if (tmpdir == nullptr) { @@ -36,7 +29,6 @@ std::string tmppath() { close(fd); return std::string(tmp.data(), tmp.size()); } -#endif void testGetSet(std::string path, std::string prefix = "") { // Basic Set/Get on File Store diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index da4f9b5fc106..6606e553e733 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -1,10 +1,7 @@ -#ifndef _WIN32 #include +#include #include #include -#endif - -#include #include #include @@ -24,7 +21,6 @@ using namespace c10d::test; constexpr auto kSendDelay = std::chrono::milliseconds(100); constexpr auto kWaitTimeout = std::chrono::milliseconds(1); -#ifndef _WIN32 class SignalTest { public: SignalTest(const std::string& path) : path_(path) {} @@ -96,7 +92,6 @@ std::shared_ptr<::c10d::ProcessGroup::Work> testSignal( test.arm(fork.pid, signal); return test.run(0, 2); } -#endif class ProcessGroupGlooDelayed : public ::c10d::ProcessGroupGloo { public: @@ -461,7 +456,6 @@ void testRecv(const std::string& path) { EXPECT_TRUE(recvCompleted); } -#ifndef _WIN32 TEST(ProcessGroupGlooTest, testSIGSTOPException) { // test SIGSTOP // Fork() and TSAN don't play well together, so skip the test if we're testing @@ -491,7 +485,6 @@ TEST(ProcessGroupGlooTest, testSIGKILLException) { EXPECT_FALSE(work->isSuccess()); EXPECT_THROW(std::rethrow_exception(work->exception()), std::exception); } -#endif TEST(ProcessGroupGlooTest, testAllReduceCPU) { { diff --git a/torch/lib/c10d/test/TestUtils.hpp 
b/torch/lib/c10d/test/TestUtils.hpp index 5f5dfca315cb..c62695485573 100644 --- a/torch/lib/c10d/test/TestUtils.hpp +++ b/torch/lib/c10d/test/TestUtils.hpp @@ -1,12 +1,9 @@ #pragma once -#ifndef _WIN32 #include +#include #include #include -#endif - -#include #include #include @@ -40,28 +37,6 @@ class Semaphore { std::condition_variable cv_; }; -#ifdef _WIN32 -std::string autoGenerateTmpFilePath() { - char tmp[L_tmpnam_s]; - errno_t err; - err = tmpnam_s(tmp, L_tmpnam_s); - if (err != 0) - { - throw std::system_error(errno, std::system_category()); - } - return std::string(tmp); -} - -std::string tmppath() { - const char* tmpfile = getenv("TMPFILE"); - if (tmpfile) { - return std::string(tmpfile); - } - else { - return autoGenerateTmpFilePath(); - } -} -#else std::string tmppath() { // TMPFILE is for manual test execution during which the user will specify // the full temp file path using the environmental variable TMPFILE @@ -88,7 +63,6 @@ std::string tmppath() { close(fd); return std::string(tmp.data(), tmp.size()); } -#endif bool isTSANEnabled() { auto s = std::getenv("PYTORCH_TEST_WITH_TSAN"); @@ -106,7 +80,6 @@ struct TemporaryFile { } }; -#ifndef _WIN32 struct Fork { pid_t pid; @@ -128,7 +101,6 @@ struct Fork { return pid == 0; } }; -#endif } // namespace test } // namespace c10d diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index b2cd30c66812..f8e5b4822bd8 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -16,7 +16,7 @@ import torch.distributed as c10d from functools import partial, reduce -from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, FILE_SCHEMA +from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM class TestSkip(NamedTuple): exit_code: int @@ -143,23 +143,10 @@ def wrapper(*args, **kwargs): return wrapper -def skip_if_win32(): - return unittest.skipIf( - sys.platform == 'win32', - "This unit test case is not supportted on Windows platform", - ) - TIMEOUT_DEFAULT = 100 TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400} -def create_device(interface=None): - if sys.platform == 'win32' or interface is None: - return c10d.ProcessGroupGloo.create_device(hostname="127.0.0.1") - else: - return c10d.ProcessGroupGloo.create_device(interface=interface) - - def get_timeout(test_id): return TIMEOUT_OVERRIDE.get(test_id.split('.')[-1], TIMEOUT_DEFAULT) @@ -219,7 +206,7 @@ def initialize_temp_directories(init_method=None): if init_method is not None: os.environ["INIT_METHOD"] = init_method else: - os.environ["INIT_METHOD"] = FILE_SCHEMA + os.path.join( + os.environ["INIT_METHOD"] = "file://" + os.path.join( init_dir_path, "shared_init_file" ) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 36434ff8aa2f..9959551031ff 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -53,10 +53,6 @@ torch.backends.disable_global_flags() -FILE_SCHEMA = "file://" -if sys.platform == 'win32': - FILE_SCHEMA = "file:///" - IS_SANDCASTLE = os.getenv('SANDCASTLE') == '1' or os.getenv('TW_JOB_USER') == 'sandcastle' class ProfilingMode(Enum): diff --git a/torch/testing/_internal/dist_utils.py b/torch/testing/_internal/dist_utils.py index 93de304a53ca..b88765211df1 100644 --- a/torch/testing/_internal/dist_utils.py +++ b/torch/testing/_internal/dist_utils.py @@ -7,7 +7,6 @@ import torch.distributed as dist import torch.distributed.rpc as rpc from 
torch.distributed.rpc import _rref_context_get_debug_info # type: ignore[attr-defined] -from torch.testing._internal.common_utils import FILE_SCHEMA if not dist.is_available(): @@ -15,7 +14,7 @@ sys.exit(0) -INIT_METHOD_TEMPLATE = FILE_SCHEMA + "{file_name}" +INIT_METHOD_TEMPLATE = "file://{file_name}" def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True, diff --git a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py index 09db831e9999..1b1f755ed4cc 100644 --- a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py +++ b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py @@ -20,7 +20,7 @@ skip_if_lt_x_gpu, skip_if_rocm, ) -from torch.testing._internal.dist_utils import dist_init, INIT_METHOD_TEMPLATE +from torch.testing._internal.dist_utils import dist_init from torch.testing._internal.distributed.rpc.rpc_agent_test_fixture import ( RpcAgentTestFixture, ) @@ -329,7 +329,7 @@ def _remote_worker_process(self): gLogger.info("The remote worker is running.") dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -346,7 +346,7 @@ def _trainer_process(self, rank: int): ) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -363,7 +363,7 @@ def _master_process(self, ddp_mode: DdpMode, simulate_uneven_inputs: bool): gLogger.info("Running the master process...") dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -500,7 +500,7 @@ def _run_test_ddp_comparision(self, simulate_uneven_inputs=False): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -567,7 +567,7 @@ def test_ddp_dist_autograd_sparse_grads(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -604,7 +604,7 @@ def test_ddp_dist_autograd_local_vs_remote(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -651,7 +651,7 @@ def test_ddp_dist_autograd_local_vs_remote_gpu(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index af5e648f6acb..f6f2b9a6fbfb 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -1,4 +1,5 @@ import copy +import fcntl import itertools import random import math @@ -21,7 
+22,6 @@ import torch.nn as nn import torch.nn.functional as F from torch.distributed.distributed_c10d import _get_default_group, AllreduceOptions, GroupMember -from torch.testing._internal.common_utils import FILE_SCHEMA from torch.testing._internal.common_distributed import ( MultiProcessTestCase, TEST_SKIPS, @@ -43,10 +43,6 @@ except ImportError: HAS_TORCHVISION = False -if sys.platform == 'win32': - import msvcrt -else: - import fcntl class Foo: def __init__(self, x): @@ -195,17 +191,10 @@ def _lock(): lockfile = os.path.join(TEMP_DIR, "lockfile") with open(lockfile, "w") as lf: try: - if sys.platform == 'win32': - msvcrt.locking(lf.fileno(), msvcrt.LK_RLCK, 1) - yield - else: - fcntl.flock(lf.fileno(), fcntl.LOCK_EX) - yield + fcntl.flock(lf.fileno(), fcntl.LOCK_EX) + yield finally: - if sys.platform == 'win32': - msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1) - else: - fcntl.flock(lf.fileno(), fcntl.LOCK_UN) + fcntl.flock(lf.fileno(), fcntl.LOCK_UN) lf.close() @@ -281,7 +270,7 @@ def tearDown(self): @property def init_method(self): - return "{}{file_name}".format(FILE_SCHEMA, file_name=self.file_name) + return "file://{file_name}".format(file_name=self.file_name) @classmethod def _run(cls, rank, test_name, file_name): @@ -2173,13 +2162,8 @@ def _test_DDP_5iter( # save the model in the middle and reload if test_save and idx == 2 and INIT_METHOD.startswith("file://"): with tempfile.NamedTemporaryFile() as tmp: - if sys.platform == 'win32': - torch.save(model_DDP, tmp) - tmp.seek(0) - model_DDP = torch.load(tmp) - else: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) with tempfile.TemporaryFile() as tmp_file: torch.save(model_DDP, tmp_file) @@ -2208,13 +2192,8 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None, gr # test serializable/unserializable with tempfile.NamedTemporaryFile() as tmp: - if sys.platform == 'win32': - torch.save(model_DDP, tmp) - tmp.seek(0) - model_DDP = torch.load(tmp) - else: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) # dummy data initialization local_bs = len(gpu_subset) @@ -2371,13 +2350,8 @@ def _test_DistributedDataParallel_SyncBatchNorm(self, gpu_subset, rank, local_bs # test serializable/unserializable with tempfile.NamedTemporaryFile() as tmp: - if sys.platform == 'win32': - torch.save(model_DDP, tmp) - tmp.seek(0) - model_DDP = torch.load(tmp) - else: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) # data initialization input_cpu = torch.randn(global_bs, 2) From bdf329ef8a256f2157aae86a5be28109c2589eb4 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 24 Sep 2020 22:49:17 -0700 Subject: [PATCH 010/292] SyncBN: preserve qconfig if it exists (#45317) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45317 Eager mode quantization depends on the presence of the `config` model attribute. Currently converting a model to use `SyncBatchNorm` removes the qconfig - fixing this. This is important if a BN is not fused to anything during quantization convert. 
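For reference, a minimal sketch of the behavior this change guarantees (it mirrors the new unit test added in this diff; `default_qconfig` is just a convenient stand-in for any qconfig):

```
import torch
import torch.nn as nn

# An unfused BatchNorm that carries a qconfig...
m = nn.Sequential(
    nn.Conv2d(1, 1, 1),
    nn.BatchNorm2d(1),
)
m[1].qconfig = torch.quantization.default_qconfig

# ...should still carry it after conversion to SyncBatchNorm.
m = torch.nn.SyncBatchNorm.convert_sync_batchnorm(m)
assert hasattr(m[1], "qconfig")  # previously the attribute was dropped here
```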
Test Plan: ``` python test/test_quantization.py TestDistributed.test_syncbn_preserves_qconfig ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D23922072 fbshipit-source-id: cc1bc25c8e5243abb924c6889f78cf65a81be158 --- test/quantization/test_workflow_module.py | 15 +++++++++++++++ torch/nn/modules/batchnorm.py | 14 ++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index 817e54460e07..5068a6fe7fd4 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -1536,6 +1536,21 @@ def forward(self, x): isinstance(fused_model.conv.bn, nn.SyncBatchNorm), "Expected BN to be converted to SyncBN") + def test_syncbn_preserves_qconfig(self): + """ + Makes sure that if a BatchNorm is not fused and a qconfig exists, + convering the module to SyncBatchNorm preserves the qconfig. + """ + m = nn.Sequential( + nn.Conv2d(1, 1, 1), + nn.BatchNorm2d(1), + ) + m[1].qconfig = torch.quantization.default_qconfig + m = torch.nn.SyncBatchNorm.convert_sync_batchnorm(m) + self.assertTrue( + hasattr(m[1], "qconfig"), + "missing qconfig after SyncBatchNorm conversion") + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @override_qengines diff --git a/torch/nn/modules/batchnorm.py b/torch/nn/modules/batchnorm.py index 075311870439..f5ca6deb5b19 100644 --- a/torch/nn/modules/batchnorm.py +++ b/torch/nn/modules/batchnorm.py @@ -114,7 +114,7 @@ def forward(self, input: Tensor) -> Tensor: else: # use exponential moving average exponential_average_factor = self.momentum - r""" + r""" Decide whether the mini-batch stats should be used for normalization rather than the buffers. Mini-batch stats are used in training mode, and in eval mode when buffers are None. """ @@ -185,7 +185,7 @@ class BatchNorm1d(_BatchNorm): track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics, and initializes statistics - buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. When these buffers are ``None``, this module always uses batch statistics. in both training and eval modes. Default: ``True`` @@ -258,7 +258,7 @@ class BatchNorm2d(_BatchNorm): track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics, and initializes statistics - buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. When these buffers are ``None``, this module always uses batch statistics. in both training and eval modes. Default: ``True`` @@ -332,7 +332,7 @@ class BatchNorm3d(_BatchNorm): track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics, and initializes statistics - buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. When these buffers are ``None``, this module always uses batch statistics. in both training and eval modes. 
Default: ``True`` @@ -414,7 +414,7 @@ class SyncBatchNorm(_BatchNorm): track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics, and initializes statistics - buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. When these buffers are ``None``, this module always uses batch statistics. in both training and eval modes. Default: ``True`` process_group: synchronization of stats happen within each process group @@ -493,7 +493,7 @@ def forward(self, input: Tensor) -> Tensor: else: # use exponential moving average exponential_average_factor = self.momentum - r""" + r""" Decide whether the mini-batch stats should be used for normalization rather than the buffers. Mini-batch stats are used in training mode, and in eval mode when buffers are None. """ @@ -576,6 +576,8 @@ def convert_sync_batchnorm(cls, module, process_group=None): module_output.running_mean = module.running_mean module_output.running_var = module.running_var module_output.num_batches_tracked = module.num_batches_tracked + if hasattr(module, "qconfig"): + module_output.qconfig = module.qconfig for name, child in module.named_children(): module_output.add_module(name, cls.convert_sync_batchnorm(child, process_group)) del module From 95df8657c94492ff026112f8e51a24216f1a9a0c Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Thu, 24 Sep 2020 23:07:38 -0700 Subject: [PATCH 011/292] Enables test linalg (#45278) Summary: Fixes https://github.com/pytorch/pytorch/issues/45271. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45278 Reviewed By: ngimel Differential Revision: D23926124 Pulled By: mruberry fbshipit-source-id: 26692597f9a1988e5fa846f97b8430c3689cac27 --- test/run_test.py | 1 + test/test_linalg.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/test/run_test.py b/test/run_test.py index d63fc372f9c2..b24a20c60f46 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -41,6 +41,7 @@ 'test_foreach', 'test_indexing', 'test_jit', + 'test_linalg', 'test_logging', 'test_mkldnn', 'test_multiprocessing', diff --git a/test/test_linalg.py b/test/test_linalg.py index c81b4dc37582..3dbf31497b77 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -5,7 +5,7 @@ from math import inf, nan, isnan from torch.testing._internal.common_utils import \ - (TestCase, run_tests, TEST_NUMPY) + (TestCase, run_tests, TEST_NUMPY, IS_MACOS, IS_WINDOWS, TEST_WITH_ASAN) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, skipCUDAIfNoMagma, skipCPUIfNoLapack) from torch.testing._internal.jit_metaprogramming_utils import gen_script_fn_and_args @@ -56,11 +56,12 @@ def test_det(self, device, dtype): # NOTE: det requires a 2D+ tensor t = torch.randn(1, device=device, dtype=dtype) - with self.assertRaises(IndexError): + with self.assertRaises(RuntimeError): op(t) # This test confirms that torch.linalg.norm's dtype argument works # as expected, according to the function's documentation + @skipCUDAIfNoMagma def test_norm_dtype(self, device): def run_test_case(input_size, ord, keepdim, from_dtype, to_dtype, compare_dtype): msg = ( @@ -154,6 +155,7 @@ def run_test_case(input, p, dim, keepdim): # This test compares torch.linalg.norm and numpy.linalg.norm to ensure that # their matrix norm results match + @skipCUDAIfNoMagma @unittest.skipIf(not TEST_NUMPY, "NumPy not 
found") @dtypes(torch.float, torch.double) def test_norm_matrix(self, device, dtype): @@ -400,6 +402,8 @@ def gen_error_message(input_size, ord, keepdim, dim=None): # Test that linal.norm gives the same result as numpy when inputs # contain extreme values (inf, -inf, nan) + @unittest.skipIf(IS_WINDOWS, "Skipped on Windows!") + @unittest.skipIf(IS_MACOS, "Skipped on MacOS!") @skipCUDAIfNoMagma @skipCPUIfNoLapack @unittest.skipIf(not TEST_NUMPY, "Numpy not found") @@ -440,14 +444,14 @@ def is_broken_matrix_norm_case(ord, x): result_n = np.linalg.norm(x_n, ord=ord) if is_broken_matrix_norm_case(ord, x): - self.assertNotEqual(result, result_n, msg=msg) + continue else: self.assertEqual(result, result_n, msg=msg) # Test degenerate shape results match numpy for linalg.norm vector norms @skipCUDAIfNoMagma @skipCPUIfNoLapack - @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + @unittest.skipIf(TEST_WITH_ASAN, "Skipped on ASAN since it checks for undefined behavior.") @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) def test_norm_vector_degenerate_shapes(self, device, dtype): def run_test_case(input, ord, dim, keepdim, should_error): From 99e0a87bbb4faa6bb539c0eedf323d79fdd8cfcf Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Thu, 24 Sep 2020 23:11:38 -0700 Subject: [PATCH 012/292] [nvFuser] Latency improvements for pointwise + reduction fusion (#45218) Summary: A lot of changes are in this update, some highlights: - Added Doxygen config file - Split the fusion IR (higher level TE like IR) from kernel IR (lower level CUDA like IR) - Improved latency with dynamic shape handling for the fusion logic - Prevent recompilation for pointwise + reduction fusions when not needed - Improvements to inner dimension reduction performance - Added input -> kernel + kernel launch parameters cache, added eviction policy - Added reduction fusions with multiple outputs (still single reduction stage) - Fixed code generation bugs for symbolic tiled GEMM example - Added thread predicates to prevent shared memory form being loaded multiple times - Improved sync threads placements with shared memory and removed read before write race - Fixes to FP16 reduction fusions where output would come back as FP32 Pull Request resolved: https://github.com/pytorch/pytorch/pull/45218 Reviewed By: ezyang Differential Revision: D23905183 Pulled By: soumith fbshipit-source-id: 12f5ad4cbe03e9a25043bccb89e372f8579e2a79 --- aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h | 1 + caffe2/CMakeLists.txt | 4 + test/cpp/jit/test_gpu.cpp | 1925 ++++++++++--- test/cpp/jit/tests.h | 21 +- test/test_jit_cuda_fuser.py | 99 +- test/test_jit_cuda_fuser_legacy.py | 6 + test/test_jit_cuda_fuser_profiling.py | 6 + tools/build_variables.bzl | 4 + torch/csrc/jit/codegen/cuda/codegen.cpp | 640 +++++ torch/csrc/jit/codegen/cuda/codegen.h | 22 + torch/csrc/jit/codegen/cuda/compute_at.cpp | 65 +- torch/csrc/jit/codegen/cuda/compute_at.h | 4 +- torch/csrc/jit/codegen/cuda/docs/.gitignore | 1 + .../jit/codegen/cuda/docs/documentation.h | 23 + .../csrc/jit/codegen/cuda/docs/fuser.doxygen | 2515 +++++++++++++++++ .../cuda/docs/images/ir_architecture.png | Bin 0 -> 96754 bytes torch/csrc/jit/codegen/cuda/docs/main_page.md | 8 + torch/csrc/jit/codegen/cuda/executor.cpp | 395 ++- torch/csrc/jit/codegen/cuda/executor.h | 86 +- .../jit/codegen/cuda/executor_kernel_arg.cpp | 2 +- .../jit/codegen/cuda/executor_kernel_arg.h | 8 + .../jit/codegen/cuda/executor_launch_params.h | 5 + .../csrc/jit/codegen/cuda/executor_utils.cpp | 217 +- 
torch/csrc/jit/codegen/cuda/executor_utils.h | 17 +- .../csrc/jit/codegen/cuda/expr_evaluator.cpp | 219 +- torch/csrc/jit/codegen/cuda/expr_evaluator.h | 86 +- torch/csrc/jit/codegen/cuda/fusion.cpp | 153 +- torch/csrc/jit/codegen/cuda/fusion.h | 56 +- torch/csrc/jit/codegen/cuda/graph_fuser.cpp | 7 + torch/csrc/jit/codegen/cuda/index_compute.cpp | 186 +- .../csrc/jit/codegen/cuda/instrumentation.cpp | 71 + torch/csrc/jit/codegen/cuda/instrumentation.h | 93 + torch/csrc/jit/codegen/cuda/interface.cpp | 1 + torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp | 22 +- torch/csrc/jit/codegen/cuda/ir_base_nodes.h | 1 + torch/csrc/jit/codegen/cuda/ir_cloner.cpp | 76 - torch/csrc/jit/codegen/cuda/ir_cloner.h | 23 - torch/csrc/jit/codegen/cuda/ir_graphviz.cpp | 50 - torch/csrc/jit/codegen/cuda/ir_graphviz.h | 7 - .../jit/codegen/cuda/ir_interface_nodes.h | 9 +- .../csrc/jit/codegen/cuda/ir_internal_nodes.h | 12 + torch/csrc/jit/codegen/cuda/ir_iostream.cpp | 840 +----- torch/csrc/jit/codegen/cuda/ir_iostream.h | 108 +- torch/csrc/jit/codegen/cuda/ir_nodes.cpp | 377 ++- torch/csrc/jit/codegen/cuda/ir_printer.h | 54 +- torch/csrc/jit/codegen/cuda/iter_visitor.cpp | 104 +- torch/csrc/jit/codegen/cuda/iter_visitor.h | 71 +- torch/csrc/jit/codegen/cuda/kernel.cpp | 147 +- torch/csrc/jit/codegen/cuda/kernel.h | 121 +- torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 384 ++- torch/csrc/jit/codegen/cuda/kernel_cache.h | 111 +- torch/csrc/jit/codegen/cuda/kernel_ir.cpp | 315 +-- torch/csrc/jit/codegen/cuda/kernel_ir.h | 215 +- .../jit/codegen/cuda/kernel_ir_builder.cpp | 104 + .../csrc/jit/codegen/cuda/kernel_ir_builder.h | 81 + .../codegen/cuda/kernel_resource_strings.h | 77 +- torch/csrc/jit/codegen/cuda/lower2device.cpp | 191 +- torch/csrc/jit/codegen/cuda/lower2device.h | 41 +- torch/csrc/jit/codegen/cuda/lower_index.cpp | 137 +- torch/csrc/jit/codegen/cuda/lower_index.h | 8 + .../jit/codegen/cuda/lower_insert_syncs.cpp | 227 ++ .../jit/codegen/cuda/lower_insert_syncs.h | 51 + torch/csrc/jit/codegen/cuda/lower_loops.cpp | 157 +- torch/csrc/jit/codegen/cuda/lower_loops.h | 69 +- .../codegen/cuda/lower_thread_predicate.cpp | 68 +- .../jit/codegen/cuda/lower_thread_predicate.h | 32 +- torch/csrc/jit/codegen/cuda/lower_unroll.cpp | 20 +- torch/csrc/jit/codegen/cuda/lower_unroll.h | 8 +- torch/csrc/jit/codegen/cuda/lower_utils.cpp | 52 +- .../jit/codegen/cuda/lower_validation.cpp | 5 +- torch/csrc/jit/codegen/cuda/manager.cpp | 62 +- torch/csrc/jit/codegen/cuda/parser.cpp | 42 +- torch/csrc/jit/codegen/cuda/partition.cpp | 5 + .../jit/codegen/cuda/predicate_compute.cpp | 76 +- .../csrc/jit/codegen/cuda/predicate_compute.h | 8 +- torch/csrc/jit/codegen/cuda/scheduler.cpp | 483 ++-- torch/csrc/jit/codegen/cuda/scheduler.h | 32 +- .../csrc/jit/codegen/cuda/shape_inference.cpp | 12 +- torch/csrc/jit/codegen/cuda/tensor_view.cpp | 21 +- torch/csrc/jit/codegen/cuda/transform_iter.h | 2 + .../jit/codegen/cuda/transform_replay.cpp | 7 + .../jit/codegen/cuda/transform_rfactor.cpp | 5 + torch/csrc/jit/codegen/cuda/type.h | 8 + torch/csrc/jit/codegen/cuda/utils.h | 15 + 84 files changed, 8911 insertions(+), 3188 deletions(-) create mode 100644 torch/csrc/jit/codegen/cuda/codegen.cpp create mode 100644 torch/csrc/jit/codegen/cuda/codegen.h create mode 100644 torch/csrc/jit/codegen/cuda/docs/.gitignore create mode 100644 torch/csrc/jit/codegen/cuda/docs/documentation.h create mode 100644 torch/csrc/jit/codegen/cuda/docs/fuser.doxygen create mode 100644 torch/csrc/jit/codegen/cuda/docs/images/ir_architecture.png create mode 
100644 torch/csrc/jit/codegen/cuda/docs/main_page.md create mode 100644 torch/csrc/jit/codegen/cuda/instrumentation.cpp create mode 100644 torch/csrc/jit/codegen/cuda/instrumentation.h create mode 100644 torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp create mode 100644 torch/csrc/jit/codegen/cuda/kernel_ir_builder.h create mode 100644 torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp create mode 100644 torch/csrc/jit/codegen/cuda/lower_insert_syncs.h diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index 4630465115c7..00e57ca63520 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -42,6 +42,7 @@ namespace at { namespace cuda { _(nvrtcGetProgramLog) \ _(nvrtcGetLoweredName) \ _(cuModuleLoadData) \ + _(cuModuleLoadDataEx) \ _(cuModuleGetFunction) \ _(cuOccupancyMaxActiveBlocksPerMultiprocessor) \ _(cuGetErrorString) \ diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 65f072b6f29d..6ea848bd32e5 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -506,6 +506,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/arith.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/compute_at.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/codegen.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/dispatch.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/expr_evaluator.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/executor.cpp @@ -515,6 +516,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/fusion.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/graph_fuser.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/index_compute.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/instrumentation.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_base_nodes.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_cloner.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_graphviz.cpp @@ -524,7 +526,9 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_cache.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_ir.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_ir_builder.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_index.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_insert_syncs.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_loops.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_thread_predicate.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_unroll.cpp diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index 80fa318d653a..d18becfa6641 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -11,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -73,11 +75,11 @@ TensorView* makeTensorWithContig( } void checkIntValue( - const EvaluationContext* eval_context, + StatefulExpressionEvaluator& evaluator, Val* val, Int::ScalarType expected_value) { TORCH_CHECK(val->isAnInt()); - const auto actual_value = ExpressionEvaluator::evaluate(val, eval_context); + const auto actual_value = evaluator.inferValue(val); TORCH_CHECK(actual_value.has_value()); TORCH_CHECK(actual_value.value() == expected_value); } @@ -162,16 +164,16 @@ void testGPU_FusionExprEvalConstants() { Fusion fusion; FusionGuard fg(&fusion); - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator 
evaluator(&fusion); auto* a = new Int(7); auto* b = new Int(3); - checkIntValue(&eval_context, neg(a), -7); - checkIntValue(&eval_context, add(a, b), 10); - checkIntValue(&eval_context, neg(mul(sub(a, b), div(a, b))), -8); - checkIntValue(&eval_context, mod(a, b), 1); - checkIntValue(&eval_context, ceilDiv(a, b), 3); + checkIntValue(evaluator, neg(a), -7); + checkIntValue(evaluator, add(a, b), 10); + checkIntValue(evaluator, neg(mul(sub(a, b), div(a, b))), -8); + checkIntValue(evaluator, mod(a, b), 1); + checkIntValue(evaluator, ceilDiv(a, b), 3); } // Evaluate basic scalar operations with bound values @@ -179,7 +181,7 @@ void testGPU_FusionExprEvalBindings() { Fusion fusion; FusionGuard fg(&fusion); - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); auto* a = new Int(); auto* b = new Int(); @@ -188,35 +190,35 @@ void testGPU_FusionExprEvalBindings() { auto* e = new Int(0); // trying to evaluate before binding should give empty results - TORCH_CHECK(!ExpressionEvaluator::evaluate(a, &eval_context).has_value()); - TORCH_CHECK(!ExpressionEvaluator::evaluate(d, &eval_context).has_value()); + TORCH_CHECK(!evaluator.inferValue(a).has_value()); + TORCH_CHECK(!evaluator.inferValue(d).has_value()); - eval_context.bind(a, 7); - eval_context.bind(b, 3); + evaluator.safeBind(a, 7); + evaluator.safeBind(b, 3); // can't bind to the results of expressions - ASSERT_ANY_THROW(eval_context.bind(c, 100)); + ASSERT_ANY_THROW(evaluator.safeBind(c, 100)); // can't bind to concrete values - ASSERT_ANY_THROW(eval_context.bind(e, 100)); + ASSERT_ANY_THROW(evaluator.safeBind(e, 100)); - checkIntValue(&eval_context, c, 10); - checkIntValue(&eval_context, sub(a, b), 4); - checkIntValue(&eval_context, mod(a, b), 1); - checkIntValue(&eval_context, ceilDiv(a, b), 3); - checkIntValue(&eval_context, d, -4); + checkIntValue(evaluator, c, 10); + checkIntValue(evaluator, sub(a, b), 4); + checkIntValue(evaluator, mod(a, b), 1); + checkIntValue(evaluator, ceilDiv(a, b), 3); + checkIntValue(evaluator, d, -4); // Reset evaluation context - eval_context = EvaluationContext(&fusion); + evaluator = StatefulExpressionEvaluator(&fusion); - eval_context.bind(a, 2); - eval_context.bind(b, 5); + evaluator.safeBind(a, 2); + evaluator.safeBind(b, 5); - checkIntValue(&eval_context, c, 7); - checkIntValue(&eval_context, sub(a, b), -3); - checkIntValue(&eval_context, mod(a, b), 2); - checkIntValue(&eval_context, ceilDiv(a, b), 1); - checkIntValue(&eval_context, d, -2); + checkIntValue(evaluator, c, 7); + checkIntValue(evaluator, sub(a, b), -3); + checkIntValue(evaluator, mod(a, b), 2); + checkIntValue(evaluator, ceilDiv(a, b), 1); + checkIntValue(evaluator, d, -2); } // Evaluate expressions in a simple IR @@ -247,8 +249,8 @@ void testGPU_FusionExprEvalBasic() { tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); - // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + // 1. Create an evaluator + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values // @@ -258,21 +260,21 @@ void testGPU_FusionExprEvalBasic() { // (ex. 
`tv0->getRootDomain()[0]->extent()` // instead of `tv0->axis(0)->extent()`) // - eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); - eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv1->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv1->getRootDomain()[1]->extent(), 128); // 3. Evaluate and check result values TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv2->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv2->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv2->axis(2)->rawExtent(), 128); TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv3->axis(2)->rawExtent(), 128); } // Evaluate expressions in a more complex IR @@ -298,33 +300,33 @@ void testGPU_FusionExprEvalComplex() { tv6->split(0, 5); tv5->merge(0); - // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + // 1. Create an evaluator + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values - eval_context.bind(tv0->getRootDomain()[0]->extent(), 129); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 127); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 129); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 127); // Evaluate and check extent values TORCH_CHECK(tv0->domain()->nDims() == 2); - checkIntValue(&eval_context, tv0->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv0->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv0->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv0->axis(1)->rawExtent(), 127); TORCH_CHECK(tv3->domain()->nDims() == 2); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 127); TORCH_CHECK(tv4->domain()->nDims() == 2); - checkIntValue(&eval_context, tv4->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv4->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv4->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv4->axis(1)->rawExtent(), 127); TORCH_CHECK(tv5->domain()->nDims() == 1); - checkIntValue(&eval_context, tv5->axis(0)->rawExtent(), 16383); + checkIntValue(evaluator, tv5->axis(0)->rawExtent(), 16383); TORCH_CHECK(tv6->domain()->nDims() == 3); - checkIntValue(&eval_context, tv6->axis(0)->rawExtent(), 26); - checkIntValue(&eval_context, tv6->axis(1)->rawExtent(), 5); - checkIntValue(&eval_context, tv6->axis(2)->rawExtent(), 127); + checkIntValue(evaluator, tv6->axis(0)->rawExtent(), 26); + checkIntValue(evaluator, tv6->axis(1)->rawExtent(), 5); + checkIntValue(evaluator, tv6->axis(2)->rawExtent(), 127); } // Evaluate expressions post lowering @@ -360,31 +362,29 @@ void 
testGPU_FusionExprEvalPostLower() { // Lower GpuLower gpulw(&fusion); - std::stringstream kernel; - gpulw.printKernel(kernel); // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values - eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); - eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv1->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv1->getRootDomain()[1]->extent(), 128); // 3. Evaluate and check result values TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv2->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv2->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv2->axis(2)->rawExtent(), 128); TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv3->axis(2)->rawExtent(), 128); - checkIntValue(&eval_context, bid_x, 2); - checkIntValue(&eval_context, tid_x, 128); + checkIntValue(evaluator, bid_x, 2); + checkIntValue(evaluator, tid_x, 128); } void testGPU_FusionClear() { @@ -505,10 +505,12 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_ir.str(), clone_ir.str()); // Lower original fusion - std::stringstream original_kernel; + std::string original_kernel; { - GpuLower lower(&original_fusion); - lower.printKernel(original_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&original_fusion); + original_kernel = + codegen::generateCudaKernel(GpuLower(&original_fusion).kernel()); } // Make sure the "before lowering" clone was not mutated @@ -529,12 +531,14 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str()); // Lower the "before lowering" and compare kernels - std::stringstream clone_kernel; + std::string clone_kernel; { - GpuLower lower(&before_lowering); - lower.printKernel(clone_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&before_lowering); + clone_kernel = + codegen::generateCudaKernel(GpuLower(&before_lowering).kernel()); } - ASSERT_EQ(original_kernel.str(), clone_kernel.str()); + ASSERT_EQ(original_kernel, clone_kernel); } void testGPU_FusionMove() { @@ -593,9 +597,7 @@ void testGPU_FusionMove() { ASSERT_EQ(original_ir.str(), another_ir.str()); // Lower the fusion IR - std::stringstream kernel; GpuLower lower(&another_fusion); - lower.printKernel(kernel); std::stringstream lowered_ir; lowered_ir << another_fusion; @@ -799,48 +801,6 @@ void testGPU_FusionTensor() { } } - { - auto tensor = at::randn({2, 1, 4}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == 
DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (int i = 0; i < static_cast(fuser_tensor->nDims()); i++) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]); - - // temporary WAR to disable contig & bcast; issue # 230 - // TODO: insert the check where broadcast & contiguous cannot be marked - // together - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[0]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - } - - { - auto tensor = at::randn({2, 3, 1}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (int i = 0; i < static_cast(fuser_tensor->nDims()); i++) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[0]); - - // temporary WAR to disable contig & bcast; issue # 230 - // TODO: insert the check where broadcast & contiguous cannot be marked - // together - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[2]); - } - // TensorType::create fills stride_properties, which helps us to mark // IterDomain properly // Note: implementation could change, depending on how much we want to invest @@ -1156,43 +1116,36 @@ void testGPU_FusionParser() { // 1. this can be moved to a dedicated "golden" file // 2. use a fuzzy compare (ignore non-significant whitespaces for example) const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3){ - float T2[4]; - if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - for(size_t i6 = 0; i6 < 4; ++i6 ) { - T2[ i6 ] - = T0[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ] - * T1[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ]; +__global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3) { + float T2[1]; + if ((((((blockIdx.x * 1) + (1 - 1)) * 128) + threadIdx.x) < T0.size[0])) { + for(size_t i6 = 0; i6 < 1; ++i6) { + T2[i6] + = T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + * T1[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; + T3[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + = T2[i6] + * T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } } else { - for(size_t i6 = 0; i6 < 4; ++i6 ) { - if ( ( ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - T2[ i6 ] - = T0[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ] - * T1[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ]; + for(size_t i6 = 0; i6 < 1; ++i6) { + if ((((((blockIdx.x * 1) + i6) * 128) + threadIdx.x) < T0.size[0])) { + T2[i6] + = T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + * T1[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } - } - } - if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - for(size_t i13 = 0; i13 < 4; ++i13 ) { - T3[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ] - = T2[ i13 ] - * T0[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ]; - } - } else { - for(size_t i13 = 0; i13 < 4; ++i13 ) { - if ( ( ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + 
threadIdx.x ) < T0.size[0] ) ) { - T3[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ] - = T2[ i13 ] - * T0[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ]; + if ((((((blockIdx.x * 1) + i6) * 128) + threadIdx.x) < T0.size[0])) { + T3[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + = T2[i6] + * T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } } } } )"; - std::string actual_kernel = GpuLower(fusion.get()).getKernel(); - actual_kernel = "\n" + actual_kernel; + const std::string actual_kernel = + "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); if (expected_kernel.size() != actual_kernel.size() || expected_kernel.compare(actual_kernel) != 0) { std::cerr @@ -1576,11 +1529,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t5), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t5)); TORCH_CHECK(at::allclose(outputs[1], t6)); } @@ -1636,11 +1585,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); fe.runFusion({t0, t1}, {kernel_tv3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(kernel_tv3, t3), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); } // Case 4 @@ -1706,11 +1651,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0, t1, t2, t3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t6), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t6)); } // Case 5 @@ -1752,176 +1693,715 @@ void testGPU_FusionAdvancedComputeAt() { } } -void testGPU_FusionScalarInputs() { +void testGPU_FusionComputeAtMultiConsumers() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -2 Fusion fusion; FusionGuard fg(&fusion); - TensorView* tv0 = makeDummyTensor(2); + TensorView* tv0 = makeDummyTensor(1); fusion.addInput(tv0); - TensorView* tv1 = makeDummyTensor(2); - fusion.addInput(tv1); - Float* f0 = new Float(); - fusion.addInput(f0); - Float* f1 = new Float(); - fusion.addInput(f1); - Float* f2 = new Float(); - fusion.addInput(f2); - Float* f3 = new Float(); - fusion.addInput(f3); - Val* f4 = mul(f0, f1); - Val* f5 = sub(f2, f3); + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + fusion.addOutput(tv2); + fusion.addOutput(tv3); - TensorView* tv2 = sub(tv1, f4); - TensorView* tv3 = add(tv0, f5); - TensorView* tv4 = mul(tv3, tv2); + // This computeAt will affect tv2 as well, even though tv2 is not in + // the data-flow path between tv1 and tv3. The reason is that tv1 is + // now computed at tv3, so tv2 must also be computed at the same + // location. Overall, what will happen is basically we merge + // expressions of all tensors and compute them in a single loop + // nest. + TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + // Note that tv2 is also computed at tv3. 
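+  // tv3 itself, being the computeAt target and a fusion output, keeps no
+  // computeAt view, which is what the checks below verify.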
+ TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + TORCH_CHECK(!tv3->hasComputeAt()); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + + at::Tensor kernel_tv2 = at::empty_like(t0, options); + at::Tensor kernel_tv3 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv2, kernel_tv3}); + + TORCH_CHECK(at::allclose(kernel_tv2, t2)); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); +} +// Similar to ComputeAtMultiConsumers, but with a common consumer. +void testGPU_FusionComputeAtCommonConsumer1() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -2 + // tv4 = tv2 + tv3 + // tv5 = tv4 * 5 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + TensorView* tv4 = add(tv2, tv3); + TensorView* tv5 = mul(tv4, new Float(5.0)); + fusion.addOutput(tv3); fusion.addOutput(tv4); + fusion.addOutput(tv5); - // Lets setup to actually run - while (tv4->nDims() > 1) - tv4->merge(0); - tv4->split(0, 128); - tv4->split(0, 4); + // Computing tv1 at tv3. This will affect tv2 as discussed in + // ComplexComputeAt1. Additionally, in this case, notice that tv4 is + // the common consumer of tv2 and tv3, so they are computed at + // tv4. The indirect propagation of the computeAt should stop at the + // common consumer, and no further change should occur. More + // specifically, tv4 and tv5 should not have a computeAt tensor. 
+ TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } - tv0->computeAt(tv4, 1); - tv1->computeAt(tv4, 1); + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv4); + TORCH_CHECK(tv3->getComputeAtView() == tv4); + TORCH_CHECK(!tv4->hasComputeAt()); + TORCH_CHECK(!tv5->hasComputeAt()); - tv4->axis(0)->parallelize(ParallelType::BIDx); + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + auto t4 = t2 + t3; + auto t5 = t4 * 5.0; + + at::Tensor kernel_tv3 = at::empty_like(t0, options); + at::Tensor kernel_tv4 = at::empty_like(t0, options); + at::Tensor kernel_tv5 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv3, kernel_tv4, kernel_tv5}); + + TORCH_CHECK(at::allclose(kernel_tv3, t3)); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); +} + +void testGPU_FusionComputeAtCommonConsumer2() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -1 + // tv4 = tv1 + 4 + // tv5 = tv3 + tv4 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv2, new Float(-1.0)); + TensorView* tv4 = add(tv1, new Float(4.0)); + TensorView* tv5 = add(tv3, tv4); + + fusion.addOutput(tv5); + + TensorView* computeAtTarget = tv3; + + computeAtTarget->merge(0); + computeAtTarget->split(0, 128); + computeAtTarget->split(0, 4); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + // This computeAt will affect all tensors including tv3, tv4 and + // tv5, even though it appears to impact only tv1 and tv2. The + // reason is that tv1 is now computed at tv3, so tv4 must also be + // computed at the same location. Similarly, the consumer of tv4, + // tv5, must also be computed at the same location. Overall, what + // will happen is basically we merge expressions of all tensors and + // compute them in a single loop nest. Internally, this will be + // realized by making all tensors, except for those in the path + // between tv1 and tv3, computed at tv5, which we call the common + // consumer. 
+ tv1->computeAt(computeAtTarget, 1); + + // All tensors should have the same dimenionality as the target + for (Val* val : fusion.vals()) { + if (fusion.hasInput(val) || + val->getValType().value() != ValType::TensorView) { + continue; + } + TensorView* tv = val->as(); + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == tv2); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + // tv3 and tv4 are computed at tv5 + TORCH_CHECK(tv3->getComputeAtView() == tv5); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + TORCH_CHECK(!tv5->hasComputeAt()); for (Val* val : fusion.vals()) { if (!fusion.hasInput(val) && val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - + TensorView* tv = val->as(); tv->axis(1)->parallelize(ParallelType::Unroll); tv->axis(-1)->parallelize(ParallelType::TIDx); } } - // f4 = f0 * f1 - // f5 = f2 - f3 - // t2 = t1 - f4 - // t3 = t0 + f5 - // t4 = t3 * t2 - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - float fl0 = 0.1; - float fl1 = -0.2; - float fl2 = 0.3; - float fl3 = -0.4; - float fl4 = fl0 * fl1; - float fl5 = fl2 - fl3; - at::Tensor t0 = at::randn({129, 127}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t1.sub(fl4); - auto t3 = t0.add(fl5); - auto t4 = t3.mul(t2); - at::Tensor kernel_tv4 = at::empty_like(t0, options); + auto t1 = t0.mul({0.5}); + auto t2 = t1.mul({-1.0}); + auto t3 = t2.mul({-1.0}); + auto t4 = t1.add({4.0}); + auto t5 = t3 + t4; - at::Scalar test(fl0); + at::Tensor kernel_tv5 = at::empty_like(t0, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - fe.runFusion( - {t0, - t1, - at::Scalar(fl0), - at::Scalar(fl1), - at::Scalar(fl2), - at::Scalar(fl3)}, - {kernel_tv4}); - - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); + fe.runFusion({t0}, {kernel_tv5}); - TORCH_CHECK(at::allclose(kernel_tv4, t4), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); } -void testGPU_FusionLoopUnroll() { +// Similar to the above common consumer test but adds an additional +// tensor that has no common consumer with the other tensors. 
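+// Here the extra tensor (tv6) only reads tv1, so it ends up as the last
+// entry in the computeAt chain and itself has no computeAt view.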
+void testGPU_FusionComputeAtCommonConsumer3() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -1 + // tv4 = tv1 + 4 + // tv5 = tv2 + tv3 + // tv6 = tv1 + 6 Fusion fusion; FusionGuard fg(&fusion); - // Set up your input tensor views - TensorView* tv0 = makeDummyTensor(3); - TensorView* tv1 = makeDummyTensor(3); - - // Register your inputs + TensorView* tv0 = makeDummyTensor(2); fusion.addInput(tv0); - fusion.addInput(tv1); - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Float(2.0)); - TensorView* tv3 = add(tv0, tv2); + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv2, new Float(-1.0)); + TensorView* tv4 = add(tv1, new Float(4.0)); + TensorView* tv5 = add(tv3, tv4); + TensorView* tv6 = add(tv1, new Float(6.0)); - // Register your outputs - fusion.addOutput(tv3); + fusion.addOutput(tv5); + fusion.addOutput(tv6); - int block_size = 16; + TensorView* computeAtTarget = tv3; - tv3->merge(0, 1); - tv3->merge(0, 1); + computeAtTarget->merge(0); + computeAtTarget->split(0, 128); + computeAtTarget->split(0, 4); - tv3->split(0, block_size); - tv3->split(0, 4); + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); + // This will have the same impact on the tensors except for tv5 and + // tv6. tv6 does not have any common consumer with the computeAt + // target, but since it uses tv1, it must be also computed at the + // same location as the other impacted tensors. We can either make + // tv5 computed at tv6 or tv6 computed at tv5. In this case, tv5 + // should be computed at tv6 just because the current implementation + // orders the computeAt relationship based on the order in which + // tensors are specified as outputs. - // Parallelize - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); + tv1->computeAt(computeAtTarget, 1); + + // All tensors should have the same dimenionality as the target + for (Val* val : fusion.vals()) { + if (fusion.hasInput(val) || + val->getValType().value() != ValType::TensorView) { + continue; + } + TensorView* tv = val->as(); + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == tv2); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + + // tv3 and tv4 are computed at tv5 + TORCH_CHECK(tv3->getComputeAtView() == tv5); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + + // tv5 should be computed at tv6 since tv5 is added as an output + // before tv6. If we call fusion.addOutput(tv6) first, tv6 should be + // computed at tv5. 
+ TORCH_CHECK(tv5->getComputeAtView() == tv6); + TORCH_CHECK(!tv6->hasComputeAt()); + + for (Val* val : fusion.vals()) { + if (!fusion.hasInput(val) && + val->getValType().value() == ValType::TensorView) { + TensorView* tv = val->as(); + tv->axis(1)->parallelize(ParallelType::Unroll); + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::rand({129, 13, 3}, options); - at::Tensor input1 = at::rand({129, 13, 3}, options); + at::Tensor t0 = at::randn({129, 127}, options); + + auto t1 = t0.mul({0.5}); + auto t2 = t1.mul({-1.0}); + auto t3 = t2.mul({-1.0}); + auto t4 = t1.add({4.0}); + auto t5 = t3 + t4; + auto t6 = t1.add({6.0}); + + at::Tensor kernel_tv5 = at::empty_like(t0, options); + at::Tensor kernel_tv6 = at::empty_like(t0, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input0, input1}); + fe.runFusion({t0}, {kernel_tv5, kernel_tv6}); - TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); + TORCH_CHECK(at::allclose(kernel_tv6, t6)); } -/* - * Helper function for single op testing that generates a codegen operand - */ +// Similar to ComputeAtCommonConsumer1 but with an addtiona ltensor +// that does not have data dependency with the consumer. +void testGPU_FusionComputeAtNoCommonConsumer() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv1 * -2 + // tv4 = tv2 + tv3 + // tv5 = tv4 * 5 + // tv6 = tv1 * 6 + Fusion fusion; + FusionGuard fg(&fusion); -Val* gen_jit_operand(std::pair desc) { - if (desc.first == ValType::TensorView) { - return makeDummyTensor(2, desc.second); - } else if (desc.first == ValType::Scalar) { - if (desc.second == DataType::Float) - return new Float(); - else if (desc.second == DataType::Int) - return new Int(); - else - TORCH_CHECK("Not currently supported type", desc.first); - } else { - TORCH_CHECK("Not currently supported type", desc.first); + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + TensorView* tv4 = add(tv2, tv3); + TensorView* tv5 = mul(tv4, new Float(5.0)); + // Notice that tv6 is not a consumer of tv4. 
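+  // It still consumes tv1, so the computeAt below pulls it into the same
+  // loop nest even though it has no data dependency with tv4 or tv5.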
+ TensorView* tv6 = mul(tv1, new Float(6.0)); + fusion.addOutput(tv3); + fusion.addOutput(tv4); + fusion.addOutput(tv5); + fusion.addOutput(tv6); + + TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4, tv6}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); } - return nullptr; -} -/* - * Helper function for single op testing that generates an ATen operand - */ + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv4); + TORCH_CHECK(tv3->getComputeAtView() == tv4); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + TORCH_CHECK(tv5->getComputeAtView() == tv6); + TORCH_CHECK(!tv6->hasComputeAt()); -IValue gen_aten_operand( - std::pair desc, - int blocks, + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + auto t4 = t2 + t3; + auto t5 = t4 * 5.0; + auto t6 = t1 * 6.0; + + at::Tensor kernel_tv3 = at::empty_like(t0, options); + at::Tensor kernel_tv4 = at::empty_like(t0, options); + at::Tensor kernel_tv5 = at::empty_like(t0, options); + at::Tensor kernel_tv6 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv3, kernel_tv4, kernel_tv5, kernel_tv6}); + + TORCH_CHECK(at::allclose(kernel_tv3, t3)); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); + TORCH_CHECK(at::allclose(kernel_tv6, t6)); +} + +namespace { + +void checkConcretized( + TensorView* v0, + int a0, + TensorView* v1, + int a1, + bool should_concretize) { + if (should_concretize) { + TORCH_CHECK( + IterDomain::concretizeDomain(v0->axis(a0))->sameAs(v1->axis(a1))); + } else { + TORCH_CHECK( + !IterDomain::concretizeDomain(v0->axis(a0))->sameAs(v1->axis(a1))); + } +} + +} // namespace + +void testGPU_FusionBCastConcretizeBasic() { + Fusion fusion; + FusionGuard fg(&fusion); + + // tv0: [I I] + TensorView* tv0 = makeDummyTensor(2); + + // tv1: [I I I] + TensorView* tv1 = makeDummyTensor(3); + + fusion.addInput(tv0); + fusion.addInput(tv1); + + // tv2*: [B I I] + auto tv2_0 = broadcast(tv0, {true, false, false}); + auto tv2_1 = broadcast(tv0, {true, false, false}); + auto tv2 = add(tv2_0, tv2_1); + + // tv3: [I I I] + auto tv3 = add(tv2, tv1); + + fusion.addOutput(tv3); + + checkConcretized(tv2, 0, tv1, 0, true); + checkConcretized(tv2_0, 0, tv1, 0, true); + checkConcretized(tv2_1, 0, tv1, 0, true); + checkConcretized(tv2_0, 1, tv1, 0, false); + checkConcretized(tv2_0, 0, tv1, 1, false); +} + +void testGPU_FusionBCastConcretizeRfactor() { + Fusion fusion; + FusionGuard fg(&fusion); + + // both tv0 and tv1 = [I, I] + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + + //[B,I,I] + auto tv2 = broadcast(tv1, {true, false, false}); + + //[B,I,R] + auto tv3 = sum(tv2, {2}); + + auto tv5 = add(tv3, tv1); + + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // scheduling: + //[B,I,R0,R1=128], root = [B,I,R] + tv3->split(2, 128); + + // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] + auto tv4 = tv3->rFactor({3}); + + checkConcretized(tv2, 0, tv5, 0, true); + checkConcretized(tv4, 0, tv5, 
0, true); + checkConcretized(tv3, 0, tv5, 0, true); +} + +namespace { + +void checkIdProvedEquivalent( + TensorView* v0, + int a0, + TensorView* v1, + int a1, + bool should_prove) { + if (should_prove) { + TORCH_CHECK(IterDomain::proveEquivalent(v0->axis(a0), v1->axis(a1))); + } else { + TORCH_CHECK(!IterDomain::proveEquivalent(v0->axis(a0), v1->axis(a1))); + } +} + +} // namespace + +void testGPU_FusionProveIdEqBasic() { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + TensorView* tv2 = makeDummyTensor(3); + + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv3 = broadcast(tv0, {true, false, false}); + auto tv4 = broadcast(tv1, {false, true, false}); + auto tv5 = add(tv3, tv4); + fusion.addOutput(tv5); + + checkIdProvedEquivalent(tv0, 0, tv4, 1, true); + checkIdProvedEquivalent(tv1, 0, tv4, 0, true); + checkIdProvedEquivalent(tv1, 1, tv0, 1, true); + checkIdProvedEquivalent(tv0, 0, tv5, 1, true); + checkIdProvedEquivalent(tv1, 1, tv5, 2, true); + checkIdProvedEquivalent(tv0, 0, tv1, 0, false); + checkIdProvedEquivalent(tv0, 1, tv1, 0, false); + checkIdProvedEquivalent(tv0, 0, tv1, 1, false); +} + +void testGPU_FusionProveIdEqRfactor() { + Fusion fusion; + FusionGuard fg(&fusion); + + // [I,I] + TensorView* tv0 = makeDummyTensor(2); + // [I,I,I] + TensorView* tv1 = makeDummyTensor(3); + + //[I,I,R] + auto tv2 = sum(tv1, {2}); + + auto tv5 = add(tv2, tv0); + + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // scheduling: + //[B,I,R0,R1=128], root = [B,I,R] + tv2->split(2, 128); + + // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] + auto tv3 = tv2->rFactor({3}); + + checkIdProvedEquivalent(tv1, 0, tv0, 0, true); + checkIdProvedEquivalent(tv2, 0, tv0, 0, true); + checkIdProvedEquivalent(tv3, 0, tv0, 0, true); +} + +void testGPU_FusionScalarInputs() { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + fusion.addInput(tv0); + TensorView* tv1 = makeDummyTensor(2); + fusion.addInput(tv1); + + Float* f0 = new Float(); + fusion.addInput(f0); + Float* f1 = new Float(); + fusion.addInput(f1); + Float* f2 = new Float(); + fusion.addInput(f2); + Float* f3 = new Float(); + fusion.addInput(f3); + Val* f4 = mul(f0, f1); + Val* f5 = sub(f2, f3); + + TensorView* tv2 = sub(tv1, f4); + TensorView* tv3 = add(tv0, f5); + TensorView* tv4 = mul(tv3, tv2); + + fusion.addOutput(tv4); + + // Lets setup to actually run + while (tv4->nDims() > 1) + tv4->merge(0); + tv4->split(0, 128); + tv4->split(0, 4); + + tv0->computeAt(tv4, 1); + tv1->computeAt(tv4, 1); + + tv4->axis(0)->parallelize(ParallelType::BIDx); + + for (Val* val : fusion.vals()) { + if (!fusion.hasInput(val) && + val->getValType().value() == ValType::TensorView) { + TensorView* tv = static_cast(val); + + tv->axis(1)->parallelize(ParallelType::Unroll); + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + } + + // f4 = f0 * f1 + // f5 = f2 - f3 + // t2 = t1 - f4 + // t3 = t0 + f5 + // t4 = t3 * t2 + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + float fl0 = 0.1; + float fl1 = -0.2; + float fl2 = 0.3; + float fl3 = -0.4; + float fl4 = fl0 * fl1; + float fl5 = fl2 - fl3; + + at::Tensor t0 = at::randn({129, 127}, options); + at::Tensor t1 = at::rand_like(t0, options); + + auto t2 = t1.sub(fl4); + auto t3 = t0.add(fl5); + auto t4 = t3.mul(t2); + + at::Tensor kernel_tv4 = at::empty_like(t0, options); + + at::Scalar test(fl0); + + torch::jit::fuser::cuda::FusionExecutor fe; + 
fe.compileFusion(&fusion); + fe.runFusion( + {t0, + t1, + at::Scalar(fl0), + at::Scalar(fl1), + at::Scalar(fl2), + at::Scalar(fl3)}, + {kernel_tv4}); + + TORCH_CHECK(at::allclose(kernel_tv4, t4)); +} + +void testGPU_FusionLoopUnroll() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(3); + TensorView* tv1 = makeDummyTensor(3); + + // Register your inputs + fusion.addInput(tv0); + fusion.addInput(tv1); + + // Do math with it, it returns a `Val*` but can be static_casted back to + // TensorView + TensorView* tv2 = add(tv1, new Float(2.0)); + TensorView* tv3 = add(tv0, tv2); + + // Register your outputs + fusion.addOutput(tv3); + + int block_size = 16; + + tv3->merge(0, 1); + tv3->merge(0, 1); + + tv3->split(0, block_size); + tv3->split(0, 4); + + // For all inputs, computeAt the output inline, temporaries should be squeezed + // between them + tv0->computeAt(tv3, 1); + tv1->computeAt(tv3, 1); + + // Parallelize + tv2->axis(1)->parallelize(ParallelType::Unroll); + tv3->axis(1)->parallelize(ParallelType::Unroll); + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(0)->parallelize(ParallelType::BIDx); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor input0 = at::rand({129, 13, 3}, options); + at::Tensor input1 = at::rand({129, 13, 3}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({input0, input1}); + + TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); +} + +/* + * Helper function for single op testing that generates a codegen operand + */ + +Val* gen_jit_operand(std::pair desc) { + if (desc.first == ValType::TensorView) { + return makeDummyTensor(2, desc.second); + } else if (desc.first == ValType::Scalar) { + if (desc.second == DataType::Float) + return new Float(); + else if (desc.second == DataType::Int) + return new Int(); + else + TORCH_CHECK("Not currently supported type", desc.first); + } else { + TORCH_CHECK("Not currently supported type", desc.first); + } + return nullptr; +} + +/* + * Helper function for single op testing that generates an ATen operand + */ + +IValue gen_aten_operand( + std::pair desc, + int blocks, int threads, bool rand) { if (desc.first == ValType::TensorView) { @@ -2012,7 +2492,7 @@ void test_op( gen_aten_operand(op, blocks, threads, /*rand*/ false).toTensor(); std::vector output_vect = {output}; cudaDeviceSynchronize(); - if (fusion.hasRNG()) + if (fusion.isStochastic()) at::manual_seed(0); torch::jit::fuser::cuda::FusionExecutor fe; @@ -2020,7 +2500,7 @@ void test_op( fe.runFusion(aten_inputs_ivalues, output_vect); cudaDeviceSynchronize(); - if (fusion.hasRNG()) + if (fusion.isStochastic()) at::manual_seed(0); at::Tensor ref_output = af(aten_inputs); cudaDeviceSynchronize(); // This sync shouldn't be necessary; @@ -2054,12 +2534,8 @@ void test_op( op_str, " -- had a mismatch.", aten_inputs_to_str(), - "\nJIT: ", - output, - "\nREF: ", - ref_output, - "\nDIFF: ", - diff, + "\nABS MAX DIFF: ", + output.sub(ref_output).abs().max(), "\n"); } @@ -2385,14 +2861,8 @@ void testGPU_FusionCastOps() { "\nOp Type: -- ", "cast FP16->FP32->FP16", " -- had a mismatch.\n", - "IN1 : ", - input1, - "\n", - "JIT: ", - outputs[0], - "\n", - "REF: ", - ref_output, + "\nABS MAX DIFF: ", + outputs[0].sub(ref_output).abs().max(), "\n"); } @@ -3453,10 +3923,6 @@ void testGPU_FusionAdvancedIndexing() { FusionGuard fg(&fusion); int w = 
3, x = 4, y = 7, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); auto tv0 = makeDummyTensor(3); auto tv1 = makeDummyTensor(4); @@ -3465,10 +3931,42 @@ void testGPU_FusionAdvancedIndexing() { auto tv2 = add(tv0, new Float(1.0)); auto tv3 = add(tv2, tv1); - fusion.addOutput(tv3); - fuser::cuda::scheduleFusion(&fusion, {t0, t1}); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({x, y, z}, options); + at::Tensor t1 = at::randn({w, x, y, z}, options); + + fuser::cuda::scheduleFusion(&fusion, {t0, t1}); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + auto t2 = t0.add(1.0); + auto t3 = t2.add(t1); + + TORCH_CHECK(t3.allclose(outputs[0])); + } + + { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeConcreteTensor({10, 20}); + fusion.addInput(tv0); + TensorView* tv1 = makeConcreteTensor({10, 10, 20}); + fusion.addInput(tv1); + + TensorView* tv2 = add(tv0, new Float(1)); + TensorView* tv3 = broadcast(tv2, {true, false, false}); + TensorView* tv4 = add(tv3, tv1); + fusion.addOutput(tv4); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({10, 20}, options); + at::Tensor t1 = at::randn({10, 10, 20}, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); @@ -4624,23 +5122,21 @@ void testGPU_FusionReductionScheduler() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand({bid_x, tid_x}, options); + at::Tensor input = at::randn({bid_x, tid_x}, options); // Apply reduction heuristic - const at::ArrayRef inputs({input}); - - TORCH_CHECK( - cuda::scheduleReduction(&fusion, inputs, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); cuda::FusionExecutor fe; fe.compileFusion(&fusion); // no broadcasting needed, omitting the last optional argument; - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum({red_dim}); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-04, 1e-04), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } @@ -4685,9 +5181,9 @@ void testGPU_FusionSymbolicReduction() { // How many threads to use for the block reduction int runtime_threadIdx_dim = 128; - torch::jit::fuser::cuda::FusionExecutor executor; - executor.compileFusion(&fusion); - auto outputs = executor.runFusion( + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( {input}, torch::jit::fuser::cuda::LaunchParams( -1, -1, -1, runtime_threadIdx_dim, -1, -1)); @@ -4716,24 +5212,22 @@ void testGPU_FusionReductionSchedulerMultiDimNonFastest() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand(tensor_dims_in, options); + at::Tensor input = at::randn(tensor_dims_in, options); at::Tensor cg_output = at::empty(tensor_dims_out, options); // Apply reduction heuristic - const at::ArrayRef inputs({input}); - - TORCH_CHECK( - 
cuda::scheduleReduction(&fusion, inputs, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum(red_dims64); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-04, 1e-04), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } @@ -4758,26 +5252,26 @@ void testGPU_FusionReductionSchedulerMultiDimFastest() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand(tensor_dims_in, options); + at::Tensor input = at::randn(tensor_dims_in, options); - TORCH_CHECK( - cuda::scheduleReduction(&fusion, {input}, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum(red_dims64); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-05, 1e-05), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } void testGPU_FusionReductionSchedulerDimShmoo() { - std::vector fp16_usage = {false}; + std::vector fp16_usage = {true, false}; std::vector red_axis = {1, 0}; std::vector output_dims = {320, 640}; std::vector red_dims; @@ -4821,40 +5315,31 @@ void testGPU_FusionReductionSchedulerDimShmoo() { .dtype((fp16 ? at::kHalf : at::kFloat)) .device(at::kCUDA, 0); at::Tensor input = - (axis ? at::rand({odim, rdim}, options) - : at::rand({rdim, odim}, options)); - - const at::ArrayRef inputs({input}); + (axis ? 
at::randn({odim, rdim}, options) + : at::randn({rdim, odim}, options)); - c10::optional rparams = - cuda::scheduleReduction(&fusion, inputs, tv1); - TORCH_CHECK(rparams != c10::nullopt, "Reduction is not found!"); + std::vector outputs_of_red; if (fp16) { - if (axis == 0) { - int tidx = rparams.value().lparams.bdimx(); - tv1_cast->split(-1, tidx); - tv1_cast->axis(-1)->parallelize(ParallelType::TIDx); - tv1_cast->axis(-2)->parallelize(ParallelType::BIDx); - } else { - if (rparams.value().mul_reds_per_blk) { - int tidy = rparams.value().lparams.bdimy(); - tv1_cast->split(0, tidy); - tv1_cast->axis(-1)->parallelize(ParallelType::TIDy); - } - tv1_cast->axis(0)->parallelize(ParallelType::BIDx); - } + outputs_of_red.push_back(tv1_cast); } + auto reduction_params = + cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params.has_value(), "Reduction is not found!"); + cuda::scheduleReduction( + &fusion, reduction_params.value(), tv1, outputs_of_red); + torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto cg_output = fe.runFusion({input}); + auto outputs = + fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum({axis}); TORCH_CHECK( - aten_output.allclose(cg_output[0]), + aten_output.allclose(outputs[0], 1e-03, 1e-03), "Error of: ", - aten_output.sub(cg_output[0]).abs().max()); + aten_output.sub(outputs[0]).abs().max()); } } } @@ -5203,6 +5688,7 @@ void testGPU_FusionSmem() { aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); } void testGPU_FusionSmemReduce() { @@ -5245,61 +5731,314 @@ void testGPU_FusionSmemReduce() { torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); + auto outputs = fe.runFusion({t0}); + + at::Tensor aten_output = sum(t0, {1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(24) == 1); +} + +void testGPU_FusionSmemBlockGemm() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + TensorView* tv0 = makeDummyTensor(2); // (M, K) + TensorView* tv1 = makeDummyTensor(2); // (K, N) + TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) + TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) + TensorView* tv4 = mul(tv2, tv3); // M, K, N + TensorView* tv5 = sum(tv4, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // Schedule + constexpr int BSX = 16; + tv5->split(2, BSX); + tv5->split(1, BSX); + tv5->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); + // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX + TensorView* tv6 = tv5->rFactor({-1}); + + tv2->setMemoryType(MemoryType::Shared); + tv3->setMemoryType(MemoryType::Shared); + tv4->setMemoryType(MemoryType::Shared); + tv6->setMemoryType(MemoryType::Shared); + + tv0->computeAt(tv5, 3); + tv1->computeAt(tv5, 3); + + // Thread and Block binding + tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(1)->parallelize(ParallelType::BIDy); + tv5->axis(-2)->parallelize(ParallelType::TIDy); + tv5->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + 
tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv4->axis(-1)->parallelize(ParallelType::TIDx); + tv6->axis(-3)->parallelize(ParallelType::TIDy); + tv6->axis(-2)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K}, options); + at::Tensor t1 = at::randn({K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + at::Tensor aten_output = matmul(t0, t1); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +void testGPU_FusionSmemBlockGemmCache() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + TensorView* tv0 = makeDummyTensor(2); // (M, K) + TensorView* tv1 = makeDummyTensor(2); // (K, N) + TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) + TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) + TensorView* tv4 = mul(tv2, tv3); // M, K, N + TensorView* tv5 = sum(tv4, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // Schedule + // Remove reduction axis from tv5 + // tv6 = (M, R, N) + // tv5 = (M, N) + TensorView* tv6 = tv5->cache_before(); + + constexpr int BSX = 16; + tv5->split(1, BSX); + tv5->split(0, BSX); + // M/BSX, BSX, N/BSX, BSX + tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); + // tv5 = M/BSX, N/BSX, MSX, NSX + + tv6->computeAt(tv5, 2); + tv6->computeAt(tv5, 2); + + tv6->split(-1, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); + // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX + TensorView* tv7 = tv6->rFactor({-1}); + // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr + // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX + + tv0->computeAt(tv6, 3); + tv1->computeAt(tv6, 3); + + tv0->computeAt(tv7, 3); + tv1->computeAt(tv7, 3); + + tv2->setMemoryType(MemoryType::Shared); + tv3->setMemoryType(MemoryType::Shared); + tv4->setMemoryType(MemoryType::Shared); + tv6->setMemoryType(MemoryType::Shared); + tv7->setMemoryType(MemoryType::Shared); + // Memory Type + + // Thread and Block binding + tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(1)->parallelize(ParallelType::BIDy); + tv5->axis(-2)->parallelize(ParallelType::TIDy); + tv5->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv4->axis(-1)->parallelize(ParallelType::TIDx); + + tv7->axis(-3)->parallelize(ParallelType::TIDy); + tv7->axis(-2)->parallelize(ParallelType::TIDx); + + tv6->axis(-2)->parallelize(ParallelType::TIDy); + tv6->axis(-1)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K}, options); + at::Tensor t1 = at::randn({K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + at::Tensor aten_output = matmul(t0, t1); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +void testGPU_FusionSmemDynamicReductionSymbolic() { + Fusion fusion; + 
FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); + fusion.addInput(tv0); + fusion.addOutput(tv1); + // tv1[I0, R1] = tv0[I0, I1] + + // Interface should just be a direct split with a Parallel type. We can + // include the parallelize call if we do this. + tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] + + TensorView* tv2 = tv1->rFactor({2}); + tv2->setMemoryType(MemoryType::Shared); + // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] + // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] + + tv0->computeAt(tv1, 1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv1->axis(0)->parallelize(ParallelType::BIDx); + + constexpr int numel_x = 65000, numel_y = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::rand({numel_x, numel_y}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {input}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + auto aten_output = input.sum({1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +void testGPU_FusionSmemDynamicReductionSymbolicArg() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + Int* sym_bsx = new Int(); + TensorView* tv0 = makeDummyTensor(3); // M, K, N + fusion.addInput(tv0); + fusion.addInput(sym_bsx); + + TensorView* tv1 = sum(tv0, {1}); // M, R, N + fusion.addOutput(tv1); + + TensorView* tv2 = tv0->cache_after(); + tv2->setMemoryType(MemoryType::Shared); + + // Schedule + constexpr int BSX = 32; + tv1->split(2, BSX); + tv1->split(1, sym_bsx); + tv1->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); + TensorView* tv3 = tv1->rFactor({-2}); + + tv0->computeAt(tv1, -2); + tv0->computeAt(tv3, -2); + + // Thread and Block binding + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::BIDy); + tv1->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K, N}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {t0, runtime_threadIdx_dim}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); at::Tensor aten_output = sum(t0, {1}); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(24) == 1); } -void testGPU_FusionSmemBlockGemm() { +void testGPU_FusionSmemDynamicPwiseMulSymbolicArgWAR() { Fusion fusion; FusionGuard fg(&fusion); - // Algorithm + Int* sym_bsx = new Int(); TensorView* tv0 = 
makeDummyTensor(2); // (M, K) TensorView* tv1 = makeDummyTensor(2); // (K, N) TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N fusion.addInput(tv0); fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Schedule - constexpr int BSX = 16; - tv5->split(2, BSX); - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv6 = tv5->rFactor({-1}); + fusion.addInput(sym_bsx); + fusion.addOutput(tv4); + // Algorithm tv2->setMemoryType(MemoryType::Shared); tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - tv0->computeAt(tv5, 3); - tv1->computeAt(tv5, 3); + constexpr int BSX = 32; + tv4->split(2, BSX); + tv4->split(1, sym_bsx); + tv4->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}}); + // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); + tv0->computeAt(tv4, 3); + tv1->computeAt(tv4, 3); + // Schedule + + tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(2)->parallelize(ParallelType::BIDy); // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv2->axis(-2)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-3)->parallelize(ParallelType::TIDy); - tv6->axis(-2)->parallelize(ParallelType::TIDx); + // Thread and Block binding - constexpr int M = 154, K = 45, N = 1524; + constexpr int M = 128, K = 457, N = 1024; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({M, K}, options); @@ -5307,100 +6046,231 @@ void testGPU_FusionSmemBlockGemm() { torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0, t1}); + auto outputs = fe.runFusion( + {t0, t1, BSX}, + torch::jit::fuser::cuda::LaunchParams(-1, -1, -1, BSX, -1, -1)); - at::Tensor aten_output = matmul(t0, t1); + at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(22) == 1); } -void testGPU_FusionSmemBlockGemmCache() { -#if 0 +void testGPU_FusionSmemDynamicTiledGemm() { Fusion fusion; FusionGuard fg(&fusion); - // Algorithm - TensorView* tv0 = makeDummyTensor(2); // (M, K) - TensorView* tv1 = makeDummyTensor(2); // (K, N) - TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) - TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) - TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N + // Symbolic integers we will use for runtime tiling + Int* symbolic_m_tile_dim = new Int(); // bound to threadIdx.z + Int* symbolic_split_k_tile_dim = new Int(); // bound to blockIdx.x + Int* symbolic_block_k_tile_dim = new Int(); // bound to threadIdx.x + // Compile-time 
integer for tiling + int n_smem_tile = 8; // bound to threadIdx.y + + // Symbolic 2D tensors TV0[M, K], TV1[K, N] + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + + // Broadcast tv0 to [M, K, *] + TensorView* tv2 = broadcast(tv0, {false, false, true}); + // Broadcast tv1 to [*, K, N] + TensorView* tv3 = broadcast(tv1, {true, false, false}); + + // Pointwise multiplication resulting in tv3[M, K, N] + TensorView* tv4 = mul(tv2, tv3); + + // Turn the K-dimension of tv4 into a reduction dimension + TensorView* tv5 = sum(tv4, {1}); + + // Register inputs and outputs fusion.addInput(tv0); fusion.addInput(tv1); fusion.addOutput(tv5); - // Schedule - // Remove reduction axis from tv5 - // tv6 = (M, R, N) - // tv5 = (M, N) - TensorView* tv6 = tv5->cache_before(); + // Register runtime tile dims as inputs + fusion.addInput(symbolic_m_tile_dim); + fusion.addInput(symbolic_split_k_tile_dim); + fusion.addInput(symbolic_block_k_tile_dim); - constexpr int BSX = 16; - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); - // tv5 = M/BSX, N/BSX, MSX, NSX + // Make a 3D tile, mix of symbolic and constant, do in reverse order because + // dims are inserted + tv5->split(2, n_smem_tile); + tv5->split(1, symbolic_block_k_tile_dim); + tv5->split(1, symbolic_split_k_tile_dim); + tv5->split(0, symbolic_m_tile_dim); - tv6->computeAt(tv5, 2); + // Reorder so all outer tiles are in the leftmost 3 positions + tv5->reorder({{1, 5}, {5, 1}}); + + // Factor out the outer reduction IterDomain, then run the inter-cta + // reduction, and intra-cta reduction + auto tv6 = tv5->rFactor({2}); + + // Scope computations tv6->computeAt(tv5, 2); - tv6->split(-1, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv7 = tv6->rFactor({-1}); - // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr - // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX + // RFactor moves reduction axes around, reorder to match ordering of tv5 + tv6->reorder({ + {2, -2}, + {3, -1}, + {4, 2}, + {5, 3}, + {6, 4}, + }); + // Setup compute at schedule tv0->computeAt(tv6, 3); tv1->computeAt(tv6, 3); + tv4->computeAt(tv6, -1); + // + // T2[Mo, bNo, Koo, Koi, Kii, Mi, bNi] CA(4, 3) + // T3[bMo, No, Koo, Koi, Kii, bMi, Ni] CA(4, 3) + // T4[ Mo, No, Koo, Koi, Kii, Mi, Ni] + // T6[ Mo, No, rKoo, Koi, Kii, Mi, Ni] + // T5[ Mo, No, rKoi, rKii, Mi, Ni] - tv0->computeAt(tv7, 3); - tv1->computeAt(tv7, 3); - + // Cache smem tiles tv2->setMemoryType(MemoryType::Shared); tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - tv7->setMemoryType(MemoryType::Shared); - // Memory Type + tv4->setMemoryType(MemoryType::Local); + tv6->setMemoryType(MemoryType::Local); - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(0)->parallelize(ParallelType::BIDz); tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv7->axis(-3)->parallelize(ParallelType::TIDy); - tv7->axis(-2)->parallelize(ParallelType::TIDx); + std::vector tv_list = {tv2, tv3, tv4, tv5, tv6}; + for (auto tv : tv_list) { + 
tv->axis(-2)->parallelize(ParallelType::TIDz); + tv->axis(-1)->parallelize(ParallelType::TIDy); + } + tv2->axis(3)->parallelize(ParallelType::TIDx); + tv3->axis(3)->parallelize(ParallelType::TIDx); + tv4->axis(3)->parallelize(ParallelType::TIDx); + tv6->axis(3)->parallelize(ParallelType::TIDx); + tv5->axis(2)->parallelize(ParallelType::TIDx); - tv6->axis(-2)->parallelize(ParallelType::TIDy); - tv6->axis(-1)->parallelize(ParallelType::TIDx); + tv2->axis(4)->parallelize(ParallelType::BIDx); + tv3->axis(4)->parallelize(ParallelType::BIDx); + tv4->axis(4)->parallelize(ParallelType::BIDx); + tv6->axis(4)->parallelize(ParallelType::BIDx); + tv5->axis(3)->parallelize(ParallelType::BIDx); - constexpr int M = 154, K = 45, N = 1524; + constexpr int M = 31, K = 65, N = 33; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); + at::Tensor A = at::randn({M, K}, options); + at::Tensor B = at::randn({K, N}, options); torch::jit::fuser::cuda::FusionExecutor fe; + // Generate CUDA and compile with nvRTC fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0, t1}); - at::Tensor aten_output = matmul(t0, t1); + // Runtime tiling + int m_tile = 4; // bound to threadIdx.z + int split_k = 7; // bound to blockIdx.x + int intra_cta = 8; // bound to threadIdx.x + + auto fuser_outputs = fe.runFusion({A, B, m_tile, split_k, intra_cta}); + auto C_fuser = fuser_outputs[0]; + + at::Tensor aten_C = mul(A.unsqueeze(2), B.unsqueeze(0)).sum(1); + TORCH_CHECK( + aten_C.allclose(C_fuser, 1e-5, 1e-5), + "Error of: ", + aten_C.sub(C_fuser).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(41) == 1); +} + +void testGPU_FusionGlobalIntermediate() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); + fusion.addInput(tv0); + fusion.addOutput(tv1); + // tv1[I0, R1] = tv0[I0, I1] + + // Interface should just be a direct split with a Parallel type. We can + // include the parallelize call if we do this. 
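+  // The split factor below is the named parallel extent for TIDx, so the
+  // inner reduction size is only fixed at launch time through LaunchParams.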
+ tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] + + TensorView* tv2 = tv1->rFactor({2}); + tv2->setMemoryType(MemoryType::Global); + // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] + // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] + + tv0->computeAt(tv1, 1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv1->axis(0)->parallelize(ParallelType::BIDx); + + constexpr int numel_x = 65000, numel_y = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::rand({numel_x, numel_y}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {input}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + auto aten_output = input.sum({1}); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); -#endif +} + +void testGPU_FusionGlobalIntermediateDefaultSchedule() { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + TensorView* tv2 = makeDummyTensor(2); + TensorView* tv3 = makeDummyTensor(2); + TensorView* tv4 = sub(tv2, tv3); + TensorView* tv5 = add(tv1, tv4); + TensorView* tv6 = sub(tv5, tv0); + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addInput(tv2); + fusion.addInput(tv3); + fusion.addOutput(tv6); + // t6 = ((t1 + (t2 - t3)) - t0) + + tv4->setMemoryType(MemoryType::Global); + tv5->setMemoryType(MemoryType::Global); + tv6->setMemoryType(MemoryType::Global); + + constexpr int M = 32, N = 810; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor in0 = at::rand({M, N}, options); + at::Tensor in1 = at::rand({M, N}, options); + at::Tensor in2 = at::rand({M, N}, options); + at::Tensor in3 = at::rand({M, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({in0, in1, in2, in3}); + + at::Tensor aten_output = (in1 + (in2 - in3)) - in0; + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().sum()); } void testGPU_FusionConstCheck() { @@ -5990,6 +6860,195 @@ void testGPU_FusionThreadPredicate() { TORCH_CHECK(aten_output_tv3.allclose(cg_output_tv3)); } +void testGPU_FusionLSTMCell() { + const int hidden_features = 512; + const int batch_size = 64; + + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tvs[16]; + for (size_t i = 0; i < 16; i++) { + tvs[i] = makeDummyTensor(2); + fusion.addInput(tvs[i]); + } + + auto ingate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3])); + + auto forgetgate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7])); + + auto cellgate = unaryOp( + UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11])); + + auto outgate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15])); + + auto cx = makeContigTensor(2); + fusion.addInput(cx); + + auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate)); + + auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy)); + + fusion.addOutput(cy); + fusion.addOutput(hy); + + std::vector inputs; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor large_tensor0 = + at::randn({batch_size, 
hidden_features * 4}, options); + at::Tensor large_tensor1 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor2 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor3 = + at::randn({batch_size, hidden_features * 4}, options); + + auto chunked0 = large_tensor0.chunk(4, 1); + auto chunked1 = large_tensor1.chunk(4, 1); + auto chunked2 = large_tensor2.chunk(4, 1); + auto chunked3 = large_tensor3.chunk(4, 1); + + inputs.insert(inputs.end(), chunked0.begin(), chunked0.end()); + inputs.insert(inputs.end(), chunked1.begin(), chunked1.end()); + inputs.insert(inputs.end(), chunked2.begin(), chunked2.end()); + inputs.insert(inputs.end(), chunked3.begin(), chunked3.end()); + + auto at_ingate = + chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid(); + auto at_forgetgate = + chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid(); + auto at_cellgate = + chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh(); + auto at_outgate = + chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid(); + + auto at_cx = at::randn({batch_size, hidden_features}, options); + inputs.push_back(at_cx); + auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate)); + auto at_hy = at_outgate.mul(at_cy.tanh()); + + fuser::cuda::scheduleFusion(&fusion, c10::ArrayRef(inputs)); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion(c10::ArrayRef(inputs)); + + TORCH_CHECK(at_cy.allclose(outputs[0], 1e-4, 1e-7)); + TORCH_CHECK(at_hy.allclose(outputs[1], 1e-4, 1e-7)); +} + +void testGPU_FusionComputeAtMultiBCast() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = broadcast(tv1, {true, false}); + TensorView* tv3 = broadcast(tv1, {false, true}); + TensorView* tv4 = add(tv2, tv3); + fusion.addOutput(tv4); + + // This is not supported and should throw an exception. 
+ ASSERT_ANY_THROW(tv1->computeAt(tv3, -1)); +} + +void testGPU_FusionReductionHalf() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(3, DataType::Half); + fusion.addInput(tv0); + + auto tv1 = castOp(DataType::Float, tv0); + auto tv2 = add(tv1, new Float(1.0)); + auto tv3 = sum(tv2, {2}); + auto tv4 = castOp(DataType::Half, tv3); + + fusion.addOutput(tv4); + + const auto options = + at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + at::Tensor input = at::randn({8, 8, 16}, options); + + auto reduction_tv = tv3; + + auto outputsOfReduction = DependencyCheck::getAllOutputsOf({reduction_tv}); + + // Grab only tensor views, though there shouldn't be any other type + auto tv_entries = ir_utils::filterByType(outputsOfReduction); + + std::vector tvOutputsOfReduction( + tv_entries.begin(), tv_entries.end()); + + auto reduction_params = + cuda::getReductionHeuristics(&fusion, {input}, reduction_tv); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction( + &fusion, reduction_params.value(), reduction_tv, tvOutputsOfReduction); + + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + + cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + // no broadcasting needed, omitting the last optional argument; + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); + + auto aten_output = input.to(c10::ScalarType::Float) + .add(1.0) + .sum({2}) + .to(c10::ScalarType::Half); + + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-04, 1e-04), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); +} + +void testGPU_FusionInputsIdLookup() { + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({16, 8, 8}, options); + at::Tensor t1 = at::randn({8, 8}, options); + at::Tensor t2 = at::randn({6, 4}, options); + + // create a cache with max size 2; + auto inputs_id_lookup = torch::jit::fuser::cuda::InputsIdLookup(2); + + // testing basic function, same encoding for identical inputs + auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0}); + auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5}); + TORCH_CHECK(id_0.id == id_0_lookup.id); + TORCH_CHECK(inputs_id_lookup.size() == 1); + TORCH_CHECK(id_0.eviction == false); + + // new input (even tho same shape, but we have different signature because of + // missing scalar input + auto id_1 = inputs_id_lookup.lookupId({t0, t1}); + auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1}); + TORCH_CHECK(id_1.id == id_1_lookup.id); + TORCH_CHECK(inputs_id_lookup.size() == 2); + TORCH_CHECK(id_1.eviction == false); + + // eviction should happen at this point + auto id_2 = inputs_id_lookup.lookupId({t2, t1}); + TORCH_CHECK(id_2.id != id_0.id); + TORCH_CHECK(id_2.id != id_1.id); + TORCH_CHECK(inputs_id_lookup.size() == 2); + TORCH_CHECK(id_2.eviction == true); + TORCH_CHECK(id_2.evict_id == id_0.id); + + // look at input 1 again + auto id_1_relook = inputs_id_lookup.lookupId({t0, t1}); + TORCH_CHECK(id_1_relook.id == id_1.id); + TORCH_CHECK(id_1_relook.eviction == false); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 0285559fb8fc..a058326c2050 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -135,7 +135,16 @@ namespace jit { _(GPU_FusionCompoundOps) \ _(GPU_FusionCastOps) \ _(GPU_FusionAdvancedComputeAt) \ + _(GPU_FusionComputeAtMultiConsumers) \ + 
_(GPU_FusionComputeAtCommonConsumer1) \ + _(GPU_FusionComputeAtCommonConsumer2) \ + _(GPU_FusionComputeAtCommonConsumer3) \ + _(GPU_FusionComputeAtNoCommonConsumer) \ _(GPU_FusionScalarInputs) \ + _(GPU_FusionBCastConcretizeBasic) \ + _(GPU_FusionBCastConcretizeRfactor) \ + _(GPU_FusionProveIdEqBasic) \ + _(GPU_FusionProveIdEqRfactor) \ _(GPU_FusionRFactorReplay) \ _(GPU_FusionReduction) \ _(GPU_FusionReduction2) \ @@ -183,6 +192,12 @@ namespace jit { _(GPU_FusionSmemReduce) \ _(GPU_FusionSmemBlockGemm) \ _(GPU_FusionSmemBlockGemmCache) \ + _(GPU_FusionSmemDynamicReductionSymbolic) \ + _(GPU_FusionSmemDynamicReductionSymbolicArg) \ + _(GPU_FusionSmemDynamicPwiseMulSymbolicArgWAR) \ + _(GPU_FusionSmemDynamicTiledGemm) \ + _(GPU_FusionGlobalIntermediate) \ + _(GPU_FusionGlobalIntermediateDefaultSchedule) \ _(GPU_FusionConstCheck) \ _(GPU_FusionSymbolicReduction) \ _(GPU_FusionUnrollWithAlloc) \ @@ -197,7 +212,11 @@ namespace jit { _(GPU_FusionTraversalOrder6) \ _(GPU_FusionTraversalOrder7) \ _(GPU_FusionBranches) \ - _(GPU_FusionThreadPredicate) + _(GPU_FusionThreadPredicate) \ + _(GPU_FusionLSTMCell) \ + _(GPU_FusionComputeAtMultiBCast) \ + _(GPU_FusionReductionHalf) \ + _(GPU_FusionInputsIdLookup) #else #define TH_FORALL_TESTS_CUDA(_) \ _(GraphExecutor) \ diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py index 9d61cd5dd157..0c8a1f9a967d 100644 --- a/test/test_jit_cuda_fuser.py +++ b/test/test_jit_cuda_fuser.py @@ -550,9 +550,8 @@ def t(x: torch.Tensor, y: torch.Tensor): jit_o = t_jit(x, y) jit_o = t_jit(x, y) o = t(x, y) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GROUP) # end-2-end test of permutation & contiguity handling in integration. @@ -595,11 +594,10 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): jit_o = t_jit(x, y) jit_o = t_jit(x, y) o = t(x, y) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - # numerical issues here due to our scheduling. - # can't use `self.assertEqual(oo, jit_oo)` - self.assertTrue(self._compare("comparing output failed", oo, jit_oo, 1e-4)) + self.assertEqual(o.dtype, jit_o.dtype) + # numerical issues here due to our scheduling. 
+ # can't use `self.assertEqual(o, jit_o)` + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GROUP) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @@ -630,6 +628,81 @@ def test_reduction_permutation(self): for perm1 in itertools.permutations(range(len(x))): self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + def test_reduction_multiple_output(self): + torch._C._jit_set_bailout_depth(2) + + def t(x: torch.Tensor, y: torch.Tensor, scale: float, z: torch.Tensor): + o = torch.mul(x, y) + o = torch.mul(o, scale) + out1 = torch.mul(o, z) + out2 = torch.sum(out1, dim=[2]) + return out1, out2 + + t_jit = torch.jit.script(t) + x = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + y = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + z = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + scale = 0.5 + jit_o = t_jit(x, y, scale, z) + jit_o = t_jit(x, y, scale, z) + o = t(x, y, scale, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GROUP) + + x = x.to(memory_format=torch.channels_last) + y = y.to(memory_format=torch.channels_last) + z = z.to(memory_format=torch.channels_last) + jit_o = t_jit(x, y, scale, z) + jit_o = t_jit(x, y, scale, z) + o = t(x, y, scale, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GROUP) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + @skipIfRocm + def test_reduction_dtype(self): + def t(x: torch.Tensor): + o = torch.mul(x, 1.0) + o = torch.sum(o, dim=[2], dtype=torch.float32) + return o + t_jit = torch.jit.script(t) + + x = torch.randn(8, 4, 16, dtype=torch.float, device="cuda") + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GROUP) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + @skipIfRocm + def test_reduction_half(self): + def t(x: torch.Tensor): + o = torch.mul(x, 1.0) + o = torch.sum(o, dim=[2]) + return o + + t_jit = torch.jit.script(t) + x = torch.randn(8, 4, 16, dtype=torch.float16, device="cuda") + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GROUP) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") @@ -651,9 +724,8 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): jit_o = t_jit(x, y, z) jit_o = t_jit(x, y, z) o = t(x, y, z) - for oo, jit_oo 
in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GROUP) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @@ -676,9 +748,8 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): jit_o = t_jit(x, y, z) jit_o = t_jit(x, y, z) o = t(x, y, z) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GROUP) diff --git a/test/test_jit_cuda_fuser_legacy.py b/test/test_jit_cuda_fuser_legacy.py index 4b9959c1231e..41e16df7d686 100644 --- a/test/test_jit_cuda_fuser_legacy.py +++ b/test/test_jit_cuda_fuser_legacy.py @@ -1,5 +1,11 @@ import sys sys.argv.append("--ge_config=legacy") + +import os +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1' +os.environ['PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL'] = '0' + from test_jit_cuda_fuser import * if __name__ == '__main__': diff --git a/test/test_jit_cuda_fuser_profiling.py b/test/test_jit_cuda_fuser_profiling.py index e2869eca7b5f..7559b85519c4 100644 --- a/test/test_jit_cuda_fuser_profiling.py +++ b/test/test_jit_cuda_fuser_profiling.py @@ -1,5 +1,11 @@ import sys sys.argv.append("--ge_config=profiling") + +import os +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1' +os.environ['PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL'] = '0' + from test_jit_cuda_fuser import * if __name__ == '__main__': diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 174bb858da44..26ab975373a8 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -339,6 +339,7 @@ libtorch_cuda_sources = [ "torch/csrc/autograd/functions/comm.cpp", "torch/csrc/jit/codegen/cuda/arith.cpp", "torch/csrc/jit/codegen/cuda/compute_at.cpp", + "torch/csrc/jit/codegen/cuda/codegen.cpp", "torch/csrc/jit/codegen/cuda/dispatch.cpp", "torch/csrc/jit/codegen/cuda/expr_evaluator.cpp", "torch/csrc/jit/codegen/cuda/executor.cpp", @@ -348,6 +349,7 @@ libtorch_cuda_sources = [ "torch/csrc/jit/codegen/cuda/fusion.cpp", "torch/csrc/jit/codegen/cuda/graph_fuser.cpp", "torch/csrc/jit/codegen/cuda/index_compute.cpp", + "torch/csrc/jit/codegen/cuda/instrumentation.cpp", "torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp", "torch/csrc/jit/codegen/cuda/ir_cloner.cpp", "torch/csrc/jit/codegen/cuda/ir_graphviz.cpp", @@ -357,8 +359,10 @@ libtorch_cuda_sources = [ "torch/csrc/jit/codegen/cuda/kernel.cpp", "torch/csrc/jit/codegen/cuda/kernel_cache.cpp", "torch/csrc/jit/codegen/cuda/kernel_ir.cpp", + "torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp", "torch/csrc/jit/codegen/cuda/lower_index.cpp", "torch/csrc/jit/codegen/cuda/lower_loops.cpp", + "torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp", "torch/csrc/jit/codegen/cuda/lower_unroll.cpp", "torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp", "torch/csrc/jit/codegen/cuda/lower_utils.cpp", diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp new file mode 100644 index 000000000000..f6e791f0edba --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.cpp @@ -0,0 +1,640 @@ + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +namespace { + 
+class CudaKernelGenerator : private OptInConstDispatch { + static constexpr char* kTab = " "; + + public: + static std::string generateKernelDefinition( + const Kernel* kernel, + const std::string& kernel_name) { + CudaKernelGenerator codegen(kernel); + codegen.genDeclaration(kernel_name); + codegen.startBlock(); + codegen.genPrologue(); + codegen.genBody(); + codegen.endBlock(); + TORCH_CHECK(codegen.block_nest_level_ == 0); + return codegen.code_.str(); + } + + private: + explicit CudaKernelGenerator(const Kernel* kernel) : kernel_(kernel) {} + + // Generates the kernel function declaration + void genDeclaration(const std::string& kernel_name) { + const auto& kernel_summary = kernel_->summary(); + + code_ << "__global__ void " << kernel_name << "("; + + std::vector params; + + // Inputs + for (auto val : kernel_->inputs()) { + params.push_back(val); + } + + // Outputs + for (auto val : kernel_->outputs()) { + params.push_back(val); + } + + // Global buffers + for (auto allocate : kernel_summary.global_allocations) { + params.push_back(allocate->buffer()); + } + + // Generate parameter declarations + for (Val* val : params) { + switch (val->getValType().value()) { + case ValType::KirTensorView: { + // TODO(kir): review this + const auto tv = val->as(); + code_ << "Tensor<" << val->getDataType().value() << ", " + << TensorDomain::noReductions( + tv->fuserTv()->getMaybeRFactorDomain()) + .size() + << "> " << gen(tv); + break; + } + case ValType::KirScalar: + code_ << val->getDataType().value() << " " << gen(val); + break; + default: + TORCH_CHECK(!"Unexpected parameter type"); + } + + if (val != params.back()) { + code_ << ", "; + } + } + + // Kernels generating random numbers take extra (seed, offset) arguments + if (kernel_summary.is_stochastic) { + code_ << ", unsigned long long seed, unsigned long long offset"; + } + + code_ << ") "; + } + + // Generates setup code which is executed before the kernel body + void genPrologue() { + const auto& kernel_summary = kernel_->summary(); + + // Random number generator (optional) + if (kernel_summary.is_stochastic) { + indent() << "const int idx = blockIdx.x*blockDim.x + threadIdx.x;\n"; + indent() << "Philox rnd(seed, idx, offset);\n"; + } + + // Do we have any dynamic shared memory buffers? + const bool has_dynamic_smem = + !kernel_summary.dynamic_smem_allocations.empty(); + + // Do we have any reductions? 
+ const bool has_reductions = kernel_summary.has_block_reductions || + kernel_summary.has_grid_reductions; + + // Shared memory + if (has_dynamic_smem || has_reductions) { + indent() << "alignas(" + << dataTypeSize(kernel_summary.largest_smem_data_type) + << ") extern __shared__ char array[];\n"; + + if (has_dynamic_smem) { + indent() << "unsigned offset = 0;\n"; + } + + if (has_reductions) { + indent() << "void* shared_mem = array;\n"; + if (has_dynamic_smem) { + indent() << "offset += " + << "((blockDim.x * blockDim.y * blockDim.z) * sizeof(" + << kernel_summary.largest_smem_data_type << "));\n"; + } + } + } + } + + void genBody() { + for (auto expr : kernel_->topLevelExprs()) { + OptInConstDispatch::handle(expr); + } + } + + void startBlock(bool continuation = false) { + if (continuation) { + code_ << "{\n"; + } else { + indent() << "{\n"; + } + ++block_nest_level_; + } + + void endBlock(const char* sep = "\n") { + --block_nest_level_; + TORCH_CHECK(block_nest_level_ >= 0); + indent() << "}" << sep; + } + + std::ostream& indent() { + for (int i = 0; i < block_nest_level_; ++i) { + code_ << kTab; + } + return code_; + } + + std::string gen(const Statement* stmt) { + std::stringstream tmp_code; + std::swap(tmp_code, code_); + handle(stmt); + std::swap(tmp_code, code_); + return tmp_code.str(); + } + + std::string gen(const kir::TensorView* tv) { + std::stringstream tv_name; + tv_name << "T" << tv->name(); + return tv_name.str(); + } + + std::string genInline(const Statement* stmt) { + const bool saved_inline = print_inline_; + print_inline_ = true; + const auto result = gen(stmt); + print_inline_ = saved_inline; + return result; + } + + void handle(const Statement* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const Expr* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const Val* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const kir::Bool* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "b" << node->name(); + } else { + code_ << *node->value(); + } + } + + void handle(const kir::Float* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "f" << node->name(); + } else { + const int digits = std::numeric_limits::max_digits10; + code_ << "float(" << std::setprecision(digits) << *node->value() << ")"; + } + } + + void handle(const kir::Half* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "h" << node->name(); + } else { + code_ << "__float2half(" << *node->value() << ")"; + } + } + + void handle(const kir::Int* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "i" << node->name(); + } else { + code_ << *node->value(); + } + } + + void handle(const kir::NamedScalar* node) final { + code_ << node->name(); + } + + void handle(const kir::TensorIndex* node) final { + code_ << gen(node->view()) << "["; + + bool first = true; + for (auto* ind : node->indices()) { + if (!ind->isZeroInt()) { + if (!first) { + code_ << " + "; + } + code_ << genInline(ind); + first = false; + } + } + + if (first) { + code_ << "0"; + } + + code_ << "]"; + } + + 
void handle(const kir::IterDomain* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::TensorDomain* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::TensorView* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::UnaryOp* node) final { + if (!print_inline_) { + indent() << gen(node->out()); + if (!node->out()->isScalar() && !node->in()->isScalar()) { + code_ << "\n"; + indent() << kTab; + } + code_ << " = "; + } + + if (auto op = inline_op_str(node->getUnaryOpType())) { + code_ << *op << gen(node->in()); + } else { + if (node->getUnaryOpType() == UnaryOpType::Cast) { + const auto cast_str = + cast_func_str({node->in()->getDataType().value(), + node->out()->getDataType().value()}); + code_ << cast_str.value(); + } else { + code_ << node->getUnaryOpType(); + } + + code_ << "("; + if (node->getUnaryOpType() == UnaryOpType::RandLike) { + code_ << "rnd"; + } else { + code_ << gen(node->in()); + } + code_ << ")"; + } + + if (!print_inline_) { + code_ << ";\n"; + } + } + + std::string genBinaryOp( + BinaryOpType op_type, + const std::string& lhs, + const std::string& rhs) { + std::stringstream expr; + if (auto op = inline_op_str(op_type)) { + expr << lhs << " " << *op << " " << rhs; + } else { + expr << op_type << "(" << lhs << ", " << rhs << ")"; + } + return expr.str(); + } + + void handle(const kir::BinaryOp* node) final { + const auto op_type = node->getBinaryOpType(); + if (print_inline_) { + // Inline expression: `lhs op rhs` + code_ << genBinaryOp(op_type, gen(node->lhs()), gen(node->rhs())); + } else { + indent() << gen(node->out()); + if (node->out()->isScalar()) { + // Single line: `out = lhs op rhs;` + code_ << " = " + << genBinaryOp(op_type, gen(node->lhs()), gen(node->rhs())); + } else { + // Split TensorView expressions across multiple lines: + // + // out + // = lhs + // op rhs; + // + if (auto op = inline_op_str(op_type)) { + code_ << "\n"; + indent() << kTab << "= " << gen(node->lhs()) << "\n"; + indent() << kTab << *op << " " << gen(node->rhs()); + } else { + code_ << " = " << op_type << "(\n"; + indent() << kTab << gen(node->lhs()) << ",\n"; + indent() << kTab << gen(node->rhs()) << ")"; + } + } + code_ << ";\n"; + } + } + + void handle(const kir::TernaryOp* node) final { + if (!print_inline_) { + indent() << gen(node->out()); + if (!node->out()->isScalar()) { + code_ << "\n"; + indent() << kTab; + } + code_ << " = "; + } + + code_ << node->getTernaryOpType() << "(" << gen(node->in1()) << ", " + << gen(node->in2()) << ", " << gen(node->in3()) << ")"; + + if (!print_inline_) { + code_ << ";\n"; + } + } + + std::string genReductionOp(BinaryOpType op_type, DataType data_type) { + std::stringstream lambda; + lambda << "[](" << data_type << " &a, " << data_type << " b) " + << "{ a = " << genBinaryOp(op_type, "a", "b") << "; }"; + return lambda.str(); + } + + void handle(const kir::BroadcastOp* node) final { + const ir_utils::ParallelTypeBitmap domains = + ir_utils::getParallelBroadcastDomains( + node->out(), kernel_->predicateMap()); + + const bool thread_x = domains.get(ParallelType::TIDx); + const bool thread_y = domains.get(ParallelType::TIDy); + const bool thread_z = domains.get(ParallelType::TIDz); + const bool block_x = domains.get(ParallelType::BIDx); + const bool block_y = domains.get(ParallelType::BIDy); + const bool block_z = domains.get(ParallelType::BIDz); + + const bool grid_broadcast_needed = block_x || block_y || block_z; + const bool block_broadcast_needed = 
thread_x || thread_y || thread_z; + + TORCH_INTERNAL_ASSERT( + !grid_broadcast_needed, + "Parallel broadcast across blocks not supported"); + + if (block_broadcast_needed) { + const auto data_type = node->out()->getDataType().value(); + indent() << "broadcast::blockBroadcast<" << (thread_x ? "true" : "false") + << ", " << (thread_y ? "true" : "false") << ", " + << (thread_z ? "true" : "false") << ">(\n"; + indent() << kTab << gen(node->out()) << ",\n"; + indent() << kTab << gen(node->in()) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem));\n"; + } else { + indent() << gen(node->out()) << "\n"; + indent() << kTab << " = " << gen(node->in()) << ";\n"; + } + } + + void handle(const kir::ReductionOp* node) final { + TORCH_CHECK(node->out()->getValType() == ValType::TensorIndex); + + const auto out = node->out()->as(); + const auto domain = out->view()->domain(); + + const bool has_block_reduce = domain->hasBlockReduction(); + const bool has_grid_reduce = domain->hasGridReduction(); + + if (!has_block_reduce && !has_grid_reduce) { + const auto gen_out = gen(out); + const auto op_type = node->getReductionOpType(); + indent() << gen_out << " = " + << genBinaryOp(op_type, gen_out, gen(node->in())) << ";\n"; + return; + } + + const auto par_domains = node->getParallelReductionDomains(); + const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end(); + const bool tidy = par_domains.find(ParallelType::TIDy) != par_domains.end(); + const bool tidz = par_domains.find(ParallelType::TIDz) != par_domains.end(); + + const auto data_type = node->out()->getDataType().value(); + const auto op_type = node->getReductionOpType(); + + if (has_block_reduce) { + if (has_grid_reduce) { + indent() << data_type << " " + << "block_result" + << ";\n"; + } + indent() << "blockReduce<" << (tidx ? "true" : "false") << ", " + << (tidy ? "true" : "false") << ", " << (tidz ? 
"true" : "false") + << ">(\n"; + if (has_grid_reduce) { + indent() << kTab << "block_result" + << ",\n"; + } else { + indent() << kTab << gen(node->out()) << ",\n"; + } + indent() << kTab << gen(node->in()) << ",\n"; + indent() << kTab << genReductionOp(op_type, data_type) << ",\n"; + indent() << kTab << "threadIdx,\n"; + indent() << kTab << "blockDim,\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + if (node->pred() == nullptr) { + indent() << kTab << "true,\n"; + } else { + indent() << kTab << genInline(node->pred()) << ",\n"; + } + indent() << kTab << genInline(node->init()) << ");\n"; + } + } + + void handle(const kir::GridReduction* node) final { + const auto rop = node->reduction_op(); + TORCH_INTERNAL_ASSERT(rop->out()->getValType() == ValType::TensorIndex); + + const auto out = rop->out()->as(); + const auto domain = out->view()->domain(); + TORCH_INTERNAL_ASSERT(domain->hasGridReduction()); + + const auto par_domains = rop->getParallelReductionDomains(); + const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end(); + const bool tidy = par_domains.find(ParallelType::TIDy) != par_domains.end(); + const bool tidz = par_domains.find(ParallelType::TIDz) != par_domains.end(); + const bool bidx = par_domains.find(ParallelType::BIDx) != par_domains.end(); + const bool bidy = par_domains.find(ParallelType::BIDy) != par_domains.end(); + const bool bidz = par_domains.find(ParallelType::BIDz) != par_domains.end(); + + const auto data_type = rop->out()->getDataType().value(); + const auto op_type = rop->getReductionOpType(); + + TORCH_INTERNAL_ASSERT( + node->reduction_buffer()->buffer()->getValType().value() == + ValType::KirTensorView); + TORCH_INTERNAL_ASSERT( + node->sync_buffer()->buffer()->getValType().value() == + ValType::KirTensorView); + const auto work_buffer = + node->reduction_buffer()->buffer()->as(); + const auto sync_buffer = + node->sync_buffer()->buffer()->as(); + + // Since block-level reduction is already done, those dimensions + // with tidx/y/z being true do not participate in the grid reduction. + indent() << kir::GridReduction::getPredicateFlagName(out->view()) << " = " + << "reduction::gridReduce<" << (bidx ? "true" : "false") << ", " + << (bidy ? "true" : "false") << ", " << (bidz ? "true" : "false") + << ", " << (!tidx ? "true" : "false") << ", " + << (!tidy ? "true" : "false") << ", " << (!tidz ? 
"true" : "false") + << ">(\n"; + indent() << kTab << gen(rop->out()) << ",\n"; + if (domain->hasBlockReduction()) { + indent() << kTab << "block_result" + << ",\n"; + } else { + indent() << kTab << gen(rop->in()) << ",\n"; + } + indent() << kTab << genReductionOp(op_type, data_type) << ",\n"; + indent() << kTab << "&" << gen(work_buffer) << "[0],\n"; + indent() << kTab << gen(sync_buffer) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + if (node->pred() == nullptr) { + indent() << kTab << "true,\n"; + } else { + indent() << kTab << genInline(node->pred()) << ",\n"; + } + indent() << kTab << genInline(node->reduction_op()->init()) << ");\n"; + } + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Woverloaded-virtual" + // TODO(Kir): fix me + void handle(const kir::Scope& scope) { + for (auto expr : scope.exprs()) { + handle(expr); + } + } +#pragma clang diagnostic pop + + void handle(const kir::ForLoop* node) final { + // TODO(kir): handle this during lowering + if (node->iter_domain()->isThread() || node->iter_domain()->isBroadcast()) { + handle(node->body()); + return; + } + + const auto gen_index = gen(node->index()); + const auto gen_start = genInline(node->iter_domain()->start()); + const auto gen_extent = genInline(node->iter_domain()->extent()); + indent() << "for(size_t " << gen_index << " = " << gen_start << "; " + << gen_index << " < " << gen_extent << "; ++" << gen_index << ") "; + + startBlock(true); + handle(node->body()); + endBlock(); + } + + void handle(const kir::IfThenElse* node) final { + indent() << "if (" << genInline(node->cond()) << ") "; + + // "then" block + startBlock(true); + handle(node->thenBody()); + + // "else" block (optional) + if (node->hasElse()) { + endBlock(" else "); + startBlock(true); + handle(node->elseBody()); + } + + endBlock(); + } + + // TODO(kir): fold initialization into Allocate + void handle(const kir::Allocate* node) final { + if (node->buffer()->getValType().value() != ValType::KirTensorView) { + indent() << node->buffer_type() << " " << gen(node->buffer()) << ";\n"; + return; + } + + const auto tv = node->buffer()->as(); + TORCH_INTERNAL_ASSERT(tv->domain()->nDims() > 0); + TORCH_INTERNAL_ASSERT(node->size() != nullptr); + + switch (tv->memoryType()) { + case MemoryType::Global: + indent() << "// Allocate global tensor " << gen(tv) << "\n"; + break; + case MemoryType::Shared: + if (node->size()->isConstScalar()) { + // Static shared memory + indent() << "__shared__ " << node->buffer_type() << " " << gen(tv) + << "[" << genInline(node->size()) << "];\n"; + } else { + // Align Offset Position + indent() << "offset = alignBufferSize(offset," + << dataTypeSize(node->buffer_type()) << ");\n"; + // Shared Memory Pointer + indent() << node->buffer_type() << "* " << gen(tv) + << " = reinterpret_cast<" << node->buffer_type() << "*>" + << "(array + offset);\n"; + // Increment Offset Position + indent() << "offset += (" << genInline(node->size()) << " * sizeof(" + << node->buffer_type() << "));\n"; + } + break; + case MemoryType::Local: + indent() << node->buffer_type() << " " << gen(tv) << "[" + << genInline(node->size()) << "];\n"; + break; + default: + TORCH_INTERNAL_ASSERT(false, "Unexpected memory type"); + } + } + + void handle(const kir::Sync* node) final { + indent() << "__syncthreads();\n"; + } + + private: + std::stringstream code_; + const Kernel* kernel_; + int block_nest_level_ = 0; + + // TODO(kir): replace with explicit assignment statements + bool print_inline_ = false; +}; + +} // 
namespace + +std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name) { + FUSER_PERF_SCOPE("generateCudaKernel"); + return CudaKernelGenerator::generateKernelDefinition(kernel, kernel_name); +} + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/codegen.h b/torch/csrc/jit/codegen/cuda/codegen.h new file mode 100644 index 000000000000..562aa1554eb2 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.h @@ -0,0 +1,22 @@ + +#pragma once + +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +//! Generates a CUDA kernel definition for the given kernel +TORCH_CUDA_API std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name = "CUDAGeneratedKernel"); + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/compute_at.cpp b/torch/csrc/jit/codegen/cuda/compute_at.cpp index 3e0f5303b966..9f8f7aba1cf4 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -20,11 +21,10 @@ ComputeAtData::ComputeAtData(TensorView* tv) void ComputeAtData::clearPass() { // If the last pass set a position, update the new_compute_at_position if // latest position would be greater than previously set. - auto pass_pos = current_traversal_position_set ? current_traversal_position - : new_compute_at_position; - - new_compute_at_position = - pass_pos > new_compute_at_position ? pass_pos : new_compute_at_position; + if (current_traversal_position_set && + current_traversal_position > new_compute_at_position) { + new_compute_at_position = current_traversal_position; + } current_traversal_position_set = false; current_traversal_position = 0; @@ -52,16 +52,19 @@ void ComputeAtData::setPassPosition(unsigned int pos) { } unsigned int ComputeAtData::getNewPosition() const { - // If the last pass set a position, update the new_compute_at_position if - // latest position would be greater than previously set. - auto pass_pos = current_traversal_position_set ? current_traversal_position - : new_compute_at_position; - - return pass_pos > new_compute_at_position ? pass_pos - : new_compute_at_position; + // If the last pass set a position, return the latest position if + // it would be greater than previously set. + if (current_traversal_position_set && + current_traversal_position > new_compute_at_position) { + return current_traversal_position; + } else { + return new_compute_at_position; + } } void ComputeAtData::validateNewComputeAt() const { + FUSER_PERF_SCOPE("validateNewComputeAt"); + TORCH_INTERNAL_ASSERT( getNewPosition() >= original_compute_at_position, "Invalid computeAt detected. 
This computeAt would invalidate the set computeAt on ", @@ -82,7 +85,22 @@ void ComputeAtData::validateNewComputeAt() const { "."); } +void ComputeAtData::setComputeAtDomain(TensorDomain* td) { + if (new_compute_at_domain_ != original_domain_) { + TORCH_INTERNAL_ASSERT( + *new_compute_at_domain_ == *td, + "TensorDomain, ", + td, + ", does not match with the previously set domain of ", + tv_ref_, + ", which is ", + new_compute_at_domain_); + } + new_compute_at_domain_ = td; +} + namespace { + // Wrapper around set_intersection template std::set set_intersection(const std::set& set1, const std::set& set2) { @@ -121,12 +139,15 @@ std::deque> tvChains( } return tv_chains; } + } // namespace void ComputeAt::run( TensorView* producer, TensorView* consumer, unsigned int consumer_position) { + FUSER_PERF_SCOPE("ComputeAt::run"); + // Make sure the correct fusion is setup between this and consumer. TORCH_CHECK( producer->fusion() == consumer->fusion(), @@ -160,6 +181,9 @@ void ComputeAt::run( // Check all dependency chains, select the next TV after producer towards // consumer. These are the TVs we're going to actually call computeAt on. for (const auto& tv_chain : all_chains) { + // When a chain only has two tensors, they must be the producer, + // which is an input, and the consumer. There is nothing we need + // to do for such chains. if (tv_chain.size() > 2) { // Make sure we only add once, but we want to add in a determinsitic // order @@ -188,6 +212,8 @@ unsigned int ComputeAt::backwardComputeAt_impl( TensorView* producer, TensorView* consumer, unsigned int consumer_compute_at_axis) { + FUSER_PERF_SCOPE("backwardComputeAt_impl"); + auto& producer_entry = tv_data.at(producer); // Use TensorDomain interface so it doesn't set computeAt automatically @@ -209,6 +235,8 @@ unsigned int ComputeAt::forwardComputeAt_impl( TensorView* producer, TensorView* consumer, unsigned int producer_compute_at_axis) { + FUSER_PERF_SCOPE("forwardComputeAt_impl"); + auto& consumer_entry = tv_data.at(consumer); const auto& producer_entry = tv_data.at(producer); @@ -229,6 +257,8 @@ unsigned int ComputeAt::forwardComputeAt_impl( } void ComputeAt::setCommonConsumer() { + FUSER_PERF_SCOPE("ComputeAt::setCommonConsumer"); + // Convert the first chain to a set. std::set common_consumers( producer_use_chains_.front().begin(), producer_use_chains_.front().end()); @@ -281,6 +311,8 @@ void ComputeAt::setCommonConsumer() { // Similar to backward traversal in traverseAllKnown but we should only apply // computeAt if it will increase computeAt positions. void ComputeAt::traverseBackward() { + FUSER_PERF_SCOPE("ComputeAt::traverseBackward"); + // propagate *backward* through all *producer* use_chains or from *producer* // to common_consumer if common_consumer exists. Only apply transform if // increases computeAt position. @@ -307,6 +339,8 @@ void ComputeAt::traverseBackward() { } void ComputeAt::traverseForward() { + FUSER_PERF_SCOPE("ComputeAt::traverseForward"); + // propagate forward through all *producer* use_chains or from *producer* to // common_consumer if common_consumer exists. 
auto chains = producer_use_chains_; @@ -338,6 +372,8 @@ void ComputeAt::traverseForward() { } void ComputeAt::runPass() { + FUSER_PERF_SCOPE("ComputeAt::runPass"); + // Initialize tv_data for all TensorViews we may modify auto chains = producer_use_chains_; if (common_consumer_ != nullptr) { @@ -382,6 +418,8 @@ void ComputeAt::runPass() { } void ComputeAt::setupOutputs() { + FUSER_PERF_SCOPE("ComputeAt::setupOutputs"); + if (common_consumer_ != nullptr) return; @@ -421,9 +459,6 @@ ComputeAt::ComputeAt( : producer_(_producer), consumer_(_consumer), consumer_position_(_consumer_position) { - if (consumer_position_ < 0) - consumer_position_ += consumer_->nDims(); - TORCH_INTERNAL_ASSERT( consumer_position_ >= 0 && consumer_position_ <= consumer_->nDims(), "Invalid computeAt axis, received ", diff --git a/torch/csrc/jit/codegen/cuda/compute_at.h b/torch/csrc/jit/codegen/cuda/compute_at.h index 84677ae99448..a9112a6225ca 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.h +++ b/torch/csrc/jit/codegen/cuda/compute_at.h @@ -56,9 +56,7 @@ class ComputeAtData { // If we set computeAt, save the domain so we can reset it after traversal. // Traversal state can deviate from the domain we will want to save after the // entire computeAt pass. - void setComputeAtDomain(TensorDomain* td) { - new_compute_at_domain_ = td; - } + void setComputeAtDomain(TensorDomain* td); // Return domain set in setComputeAtDomain TensorDomain* getComputeAtDomain() const { diff --git a/torch/csrc/jit/codegen/cuda/docs/.gitignore b/torch/csrc/jit/codegen/cuda/docs/.gitignore new file mode 100644 index 000000000000..1936cc1d441e --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/.gitignore @@ -0,0 +1 @@ +html diff --git a/torch/csrc/jit/codegen/cuda/docs/documentation.h b/torch/csrc/jit/codegen/cuda/docs/documentation.h new file mode 100644 index 000000000000..cfd4435461b9 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/documentation.h @@ -0,0 +1,23 @@ + +#error This is used exclusively for generating the documentation (not a real header) + +//! \namespace torch::jit::fuser +//! \brief Main PyTorch JIT Fuser namespace + +//! \namespace torch::jit::fuser::cuda +//! \brief CUDA specific components + +//! \namespace torch::jit::fuser::cuda::executor_utils +//! \brief Fuser executor related utilities + +//! \namespace torch::jit::fuser::kir +//! \brief Kernel IR + +//! \namespace torch::jit::fuser::ir_utils +//! \brief IR manipulation utilities + +//! \namespace torch::jit::fuser::loop_utils +//! \brief Loop utilities + +//! \namespace torch::jit::fuser::scope_utils +//! \brief Scope utilities diff --git a/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen b/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen new file mode 100644 index 000000000000..b9a51b187aa5 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen @@ -0,0 +1,2515 @@ +# Doxyfile 1.8.14 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). 
+ +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. + +PROJECT_NAME = "PyTorch JIT Fuser" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
+ +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = YES + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = YES + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. 
+ +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 0. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 0 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) 
but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = YES + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. 
+ +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. + +# TODO: switch to NO once key concepts are documented +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. 
+ +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. 
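+# As an illustrative sketch (not part of the generated template), with this tag
+# set to YES a header reference in the generated docs would be rendered as
+#   #include "mylib.h"
+# rather than
+#   #include <mylib.h>
+# where mylib.h is a placeholder for the real header name.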
+# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = NO + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. 
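+#
+# Illustrative example (not part of the generated template; the names are
+# placeholders): a documentation comment such as
+#
+#   /// \deprecated Use new_widget_api() instead.
+#   void old_widget_api();
+#
+# adds old_widget_api() to the deprecated list that this tag controls.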
+ +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. 
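+#
+# Illustrative example (not part of the generated template; refs.bib and
+# knuth97 are placeholder names): if a bib file were listed here, e.g.
+#
+#   CITE_BIB_FILES = refs.bib
+#
+# then a documentation comment could reference one of its entries with
+# \cite knuth97 and doxygen would generate a bibliography entry for it.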
+ +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = NO + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT += .. +INPUT += documentation.h +INPUT += main_page.md + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. 
See the libiconv +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE += + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS += Ui +EXCLUDE_SYMBOLS += internal +EXCLUDE_SYMBOLS += __* + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. 
+
+EXAMPLE_PATTERNS = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH = images
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE = main_page.md
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. 
+# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = --std=c++1z + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: 0. + +CLANG_COMPILATION_DATABASE_PATH = 0 + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. 
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. 
+# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via Javascript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have Javascript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: https://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 1 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/ + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. 
See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/