From d3023d86ba6eebbcfeafc1633d1a8f8249d12e6d Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein
Date: Thu, 4 Feb 2021 22:35:37 -0800
Subject: [PATCH] Revert D26249330: [Gradient Compression] Add a documentation
 page for DDP communication hooks

Test Plan: revert-hammer

Differential Revision:
D26249330 (https://github.com/pytorch/pytorch/commit/e62aabac43720b42f89cbe0ebd76fa22099ac698)

Original commit changeset: ab973390ddb7

fbshipit-source-id: d508daed76219e7ca588cf7fb38aeaaffc61acfd
---
 docs/source/ddp_comm_hooks.rst          |  68 ---------
 docs/source/index.rst                   |   1 -
 .../ddp_comm_hooks/powerSGD_hook.py     | 137 ++++++++----------
 torch/nn/parallel/distributed.py        |  13 +-
 4 files changed, 64 insertions(+), 155 deletions(-)
 delete mode 100644 docs/source/ddp_comm_hooks.rst

diff --git a/docs/source/ddp_comm_hooks.rst b/docs/source/ddp_comm_hooks.rst
deleted file mode 100644
index b4908cab8da5..000000000000
--- a/docs/source/ddp_comm_hooks.rst
+++ /dev/null
@@ -1,68 +0,0 @@
-DDP Communication Hooks
-=======================
-
-DDP communication hook is a generic interface to control how to communicate
-gradients across workers by overriding the vanilla allreduce in
-`DistributedDataParallel `_.
-A few built-in communication hooks are provided,
-and users can easily apply any of these hooks to optimize communication.
-Besides, the hook interface can also support user-defined communication
-strategies for more advanced use cases.
-
-.. warning ::
-    DDP communication hook is experimental and subject to change.
-
-.. warning ::
-    DDP communication hooks can only support single process single device mode
-    on NCCL backend.
-
-How to Use A Communication Hook?
---------------------------------
-
-To use a communication hook, the user just needs to let the DDP model register
-the hook before the training loop.
-
-.. automethod:: torch.nn.parallel.DistributedDataParallel.register_comm_hook
-
-Default Communication Hooks
----------------------------
-
-Default communication hooks are simple **stateless** hooks, so the input state
-in ``register_comm_hook`` is either a process group or ``None``.
-
-.. automodule:: torch.distributed.algorithms.ddp_comm_hooks.default_hooks
-    :members:
-
-PowerSGD Communication Hook
----------------------------
-
-PowerSGD communication hook is a **stateful** hook used for gradient
-compression, and the user needs to provide a state defined as below.
-The performance is `on par with `_
-the implementation in the original `paper `_.
-
-PowerSGD State
-^^^^^^^^^^^^^^^^
-
-.. currentmodule:: torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook
-.. autoclass:: PowerSGDState
-
-PowerSGD Hooks
-^^^^^^^^^^^^^^^^
-
-.. warning ::
-    PowerSGD requires an extra copy of gradients for error feedback,
-    which may be infeasible for use cases that have a memory constraint.
-
-.. warning ::
-    The current implementation may cause gradient overflow for FP16 input.
-
-.. autofunction:: powerSGD_hook
-.. autofunction:: batched_powerSGD_hook
-
-Acknowledgements
-----------------
-
-Thanks PowerSGD paper author Thijs Vogels for the code review on PowerSGD
-communication hook and the
-`comparison experiments `_.
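To use a communication hook, the DDP model registers the hook before the training loop, as the deleted page describes. A minimal sketch, assuming ``dist.init_process_group("nccl", ...)`` has already run and that ``model`` and ``rank`` come from the surrounding training script; ``fp16_compress_hook`` is one of the built-in stateless default hooks, so ``None`` (the global process group) is passed as state:

    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP
    from torch.distributed.algorithms.ddp_comm_hooks import default_hooks

    # Assumes the NCCL process group and CUDA devices are already set up.
    ddp_model = DDP(model, device_ids=[rank])

    # Stateless default hooks take a process group (or None for the global
    # group) as state; the hook then replaces the vanilla allreduce.
    ddp_model.register_comm_hook(state=None, hook=default_hooks.fp16_compress_hook)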
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 010df3c36e23..595d164df4a4 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -71,7 +71,6 @@ Features described in this documentation are classified by release status:
    onnx
    optim
    complex_numbers
-   ddp_comm_hooks
    pipeline
    quantization
    rpc
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
index eaae325253ce..60607b7bbf49 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
@@ -33,24 +33,22 @@ def _orthogonalize(matrix, epsilon=1e-8):


 class PowerSGDState(object):
-    r"""
+    """
     Stores both the gradient compression configs and the internal states for all the gradients during the training.
-    Particularly, ``matrix_approximation_rank`` and ``start_powerSGD_iter`` are the main configs that need to be tuned by the user.
-    Although ``use_error_feedback`` and ``warm_start`` can also be tuned by the user,
+    Particularly, `matrix_approximation_rank` and `start_powerSGD_iter` are the main configs that need to be tuned by the user.
+    Although `use_error_feedback` and `warm_start` can also be tuned by the user,
     they are typically turned on for performance.

-    Note [Guidance to Tune ``matrix_approximation_rank`` And ``start_powerSGD_iter``]
+    Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`]
     ~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-    1. To tune ``matrix_approximation_rank``, the user can increase it from 1 by factors of 2,
+    1) To tune `matrix_approximation_rank`, the user can increase it from 1 by factors of 2,
     until a satisfying accuracy can be reached.
-    The increase of ``matrix_approximation_rank`` can substantially increase the computation costs of the compression.
-    However, the accuracy may not be futher improved beyond a certain ``matrix_approximation_rank`` value.
-
-    2. To tune ``start_powerSGD_iter``, the user can typically start with 10% of total training steps,
+    The increase of `matrix_approximation_rank` can substantially increase the computation costs of the compression.
+    However, the accuracy may not be further improved beyond a certain `matrix_approximation_rank` value.
+    2) To tune `start_powerSGD_iter`, the user can typically start with 10% of total training steps,
     and increase it until a satisfying accuracy can be reached.
     Deferring PowerSGD can effectively improve the accuracy,
-    even a relatively small ``matrix_approximation_rank`` is used.
+    even if a relatively small `matrix_approximation_rank` is used.
     This is because the beginning of the training phase is usually very sensitive to inaccurate gradients,
     and compressing gradients too early may make the training quickly take a suboptimal trajectory,
     which can result in an irrecoverable impact on the accuracy.
@@ -164,44 +162,38 @@ def maybe_increase_iter(self, bucket):


 def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
-    r"""
+    """
     This DDP communication hook implements the original PowerSGD gradient compression
     algorithm described in https://arxiv.org/abs/1905.13727.
     Once gradient tensors are aggregated across all workers, this hook applies
     compression as follows:
-
-    1. Views the input flattened 1D gradient tensor as two groups of per-parameter tensors: high-rank tensors and vector-like rank-1 tensors (for biases).
-
-    2. Handles rank-1 tensors by allreducing them without compression:
-
-        2.1. Allocate contiguous memory for those rank-1 tensors, and allreduces all the rank-1 tensors as a batch, without compression;
-
-        2.2. Copies the individual rank-1 tensors from the contiguous memory back to the input tensor.
-
-    3. Handles high-rank tensors by PowerSGD compression:
-
-        3.1. For each high-rank tensor M, creates two low-rank tensors P and Q for decomposing M,
+    1) Views the input flattened 1D gradient tensor as two groups of per-parameter tensors:
+    high-rank tensors and vector-like rank-1 tensors (for biases).
+    2) Handles rank-1 tensors by allreducing them without compression:
+        2.1) Allocate contiguous memory for those rank-1 tensors,
+        and allreduces all the rank-1 tensors as a batch, without compression;
+        2.2) Copies the individual rank-1 tensors from the contiguous memory back to the input tensor.
+    3) Handles high-rank tensors by PowerSGD compression:
+        3.1) For each high-rank tensor M, creates two low-rank tensors P and Q for decomposing M,
     such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
-
-        3.2. Computes each P in Ps, which is equal to MQ;
-
-        3.3. Allreduces Ps as a batch;
-
-        3.4. Orthogonalizes each P in Ps;
-
-        3.5. Computes each Q in Qs, which is approximately equal to M^TP;
-
-        3.6. Allreduces Qs as a batch;
-
-        3.7. Computes each M among all the high-rank tensors, which is approximately equal to PQ^T.
-
-    Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations.
+        3.2) Computes each P in Ps, which is equal to MQ;
+        3.3) Allreduces Ps as a batch;
+        3.4) Orthogonalizes each P in Ps;
+        3.5) Computes each Q in Qs, which is approximately equal to M^TP;
+        3.6) Allreduces Qs as a batch;
+        3.7) Computes each M among all the high-rank tensors, which is approximately equal to PQ^T.
+
+    Note that this communication hook enforces vanilla allreduce for the first `state.start_powerSGD_iter` iterations.
     This can not only allow the user to have a finer tuning over the tradeoff between speedup and accuracy,
     but also help abstract away some complexity of the internal optimization of DDP for future communication hook developers.

+    TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration --
+    one left multiplication and one right multiplication.
+    For warm-start, can take one such step at a time, and alternate between them.
+
     Args:
         state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
-            To tune the compression configs, see Note [Guidance to Tune ``matrix_approximation_rank`` And ``start_powerSGD_iter``].
+            To tune the compression configs, see Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`].
         bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
             Note that since DDP comm hook only supports single process single device mode at this time,
             only exactly one tensor is stored in this bucket.

     Returns:
         Future handler of the communication, which updates the gradients in place.
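To make steps 3.1 to 3.7 concrete, here is a minimal single-tensor sketch of one PowerSGD round, assuming a recent PyTorch. It uses QR factorization in place of the hook's Gram-Schmidt `_orthogonalize`, and the allreduce steps appear only as comments, since the real hook averages the batched Ps and Qs across workers:

    import torch

    torch.manual_seed(0)
    M = torch.randn(64, 32)  # one "high-rank" gradient tensor from the bucket
    rank = 4                 # matrix_approximation_rank

    # 3.1) Q is initialized from a standard normal distribution and orthogonalized.
    Q, _ = torch.linalg.qr(torch.randn(32, rank))
    P = M @ Q                      # 3.2) P = MQ
    # 3.3) dist.all_reduce(P) would average P across workers here.
    P, _ = torch.linalg.qr(P)      # 3.4) orthogonalize P
    Q = M.t() @ P                  # 3.5) Q ~= M^T P
    # 3.6) dist.all_reduce(Q) would average Q across workers here.
    M_approx = P @ Q.t()           # 3.7) M ~= PQ^T
    print((M - M_approx).norm() / M.norm())  # relative compression error

At rank 4 the two low-rank factors hold (64 + 32) * 4 values instead of 64 * 32, which is where the communication savings come from.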
    Example::
-        >>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
+        state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
         >>> ddp_model.register_comm_hook(state, powerSGD_hook)
-    """  # noqa
+    """
     process_group = state.process_group
     group_to_use = process_group if process_group is not None else dist.group.WORLD
     world_size = group_to_use.size()
@@ -382,10 +374,6 @@ def compute_qs(fut):
         for tensor, p, q in zip(high_rank_tensors, ps, qs):
             torch.matmul(tensor.t(), p, out=q)

-        # TODO: The above procedure does two matmul+allreduce steps per iteration --
-        # one left multiplication and one right multiplication.
-        # For warm-start, can take one such step at a time, and alternate between them.
-
         # Allreduce Qs.
         return [
             dist.all_reduce(
@@ -424,44 +412,40 @@ def decompress(fut):


 def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
-    r"""
+    """
     This DDP communication hook implements a simplified PowerSGD gradient compression
     algorithm described in https://arxiv.org/abs/1905.13727.
     Once gradient tensors are aggregated across all workers, this hook applies
     compression to the flattened input tensor that batches per-parameter tensors as follows:
-
-    1. Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
-
-    2. Creates two low-rank tensors P and Q for decomposing M, such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
-
-    3. Computes P, which is equal to MQ;
-
-    4. Allreduces P;
-
-    5. Orthogonalizes P;
-
-    6. Computes Q, which is approximately equal to M^TP;
-
-    7. Allreduces Q;
-
-    8. Computes M, which is approximately equal to PQ^T.
-
-    9. Truncates the input tensor to the original length.
-
-    This variant is faster than :meth:`powerSGD_hook` that runs layer-wise gradient compression,
-    but it usually results in a much lower accuracy, unless ``matrix_approximation_rank`` in the state is 1.
-    Increasing ``matrix_approximation_rank`` may not necessarily increase the accuracy,
+    1) Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
+    2) Creates two low-rank tensors P and Q for decomposing M,
+    such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
+    3) Computes P, which is equal to MQ;
+    4) Allreduces P;
+    5) Orthogonalizes P;
+    6) Computes Q, which is approximately equal to M^TP;
+    7) Allreduces Q;
+    8) Computes M, which is approximately equal to PQ^T.
+    9) Truncates the input tensor to the original length.
+
+    This variant is faster than `powerSGD_hook` that runs layer-wise gradient compression,
+    but it usually results in a much lower accuracy, unless `matrix_approximation_rank` in the state is 1.
+    Increasing `matrix_approximation_rank` may not necessarily increase the accuracy,
     because batching per-parameter tensors without column/row alignment can destroy low-rank structure.
-    Therefore, the user should always consider :meth:`powerSGD_hook` first,
-    and only consider this variant when a satisfying accuracy can be achieved when ``matrix_approximation_rank`` is 1.
+    Therefore, the user should always consider `powerSGD_hook` first,
+    and only consider this variant when a satisfying accuracy can be achieved when `matrix_approximation_rank` is 1.
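The square reshaping in steps 1) and 9) is the distinctive part of this variant; the middle steps are one P/Q round exactly as in the powerSGD_hook sketch above. A minimal sketch of the reshaping, assuming a recent PyTorch and a hypothetical bucket length of 1000:

    import math
    import torch

    grad = torch.randn(1000)                   # flattened 1D gradient bucket
    side = math.ceil(math.sqrt(grad.numel()))  # smallest square that fits the bucket
    padded = torch.zeros(side * side)
    padded[: grad.numel()] = grad
    M = padded.view(side, side)                # 1) square view with 0 paddings
    # ... one P/Q compression round on M, as in the powerSGD_hook sketch above ...
    restored = M.reshape(-1)[: grad.numel()]   # 9) truncate to the original length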
-    Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations.
+    Note that this communication hook enforces vanilla allreduce for the first `state.start_powerSGD_iter` iterations.
     This can not only allow the user to have a finer tuning over the tradeoff between speedup and accuracy,
     but also help abstract away some complexity of the internal optimization of DDP for future communication hook developers.

+    TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration --
+    one left multiplication and one right multiplication.
+    For warm-start, can take one such step at a time, and alternate between them.
+
     Args:
         state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
-            To tune the compression configs, see Note [Guidance to Tune ``matrix_approximation_rank`` And ``start_powerSGD_iter``].
+            To tune the compression configs, see Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`].
         bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
             Note that since DDP comm hook only supports single process single device mode at this time,
             only exactly one tensor is stored in this bucket.

     Returns:
         Future handler of the communication, which updates the gradients in place.

     Example::
-        >>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
+        state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
         >>> ddp_model.register_comm_hook(state, batched_powerSGD_hook)
-    """  # noqa
+    """
     process_group = state.process_group
     group_to_use = process_group if process_group is not None else dist.group.WORLD
     world_size = group_to_use.size()
@@ -579,11 +563,6 @@ def compute_q(fut):
             out=state.q_memory_dict[bucket_index],
         )
-
-        # TODO: The above procedure does two matmul+allreduce steps per iteration --
-        # one left multiplication and one right multiplication.
-        # For warm-start, can take one such step at a time, and alternate between them.
-
         return [
             dist.all_reduce(
                 state.q_memory_dict[bucket_index], group=group_to_use, async_op=True
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index b1c0cbe65fc2..876309f4589f 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -1021,13 +1021,12 @@ def register_comm_hook(self, state: object, hook: callable):
         parameter syncs while running Distributed DataParallel training.

         Args:
-            state (object): Passed to the hook to maintain any state information during the training process.
-                            Examples include error feedback in gradient compression,
-                            peers to communicate with next in GossipGrad, etc.
-
-                            It is locally stored by each worker
-                            and shared by all the gradient tensors on the worker.
-            hook (callable): Averages gradient tensors across workers and defined as:
+            state (object): state is passed to the hook and can be used to maintain
+                            and update any state information that users would like to
+                            maintain as part of the training process. Examples: error
+                            feedback in gradient compression, peers to communicate with
+                            next in GossipGrad, etc.
+            hook (callable): is defined as:
                              hook(state: object, bucket: dist._GradBucket) -> torch.futures.Future:

                              This function is called once the bucket is ready. The
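Although the hunk is truncated here, the documented signature is complete enough to sketch a custom hook. A minimal example that reproduces vanilla allreduce averaging, assuming this era's bucket API exposes `get_tensors()` as the built-in default hooks do (the hook name is illustrative):

    import torch
    import torch.distributed as dist

    def allreduce_avg_hook(state: object, bucket: dist._GradBucket) -> torch.futures.Future:
        """Average the bucket's flattened gradient across workers (vanilla behavior)."""
        group = dist.group.WORLD
        world_size = group.size()
        tensor = bucket.get_tensors()[0]  # single tensor per bucket, as noted above
        fut = dist.all_reduce(tensor, group=group, async_op=True).get_future()

        def div_by_world_size(fut):
            # DDP expects the hook's future to resolve to the reduced tensor(s).
            return [fut.value()[0].div_(world_size)]

        return fut.then(div_by_world_size)

    # Registration mirrors the examples above:
    # ddp_model.register_comm_hook(state=None, hook=allreduce_avg_hook)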