Revert D26249330: [Gradient Compression] Add a documentation page for DDP communication hooks

Test Plan: revert-hammer

Differential Revision:
D26249330 (e62aaba)

Original commit changeset: ab973390ddb7

fbshipit-source-id: d508daed76219e7ca588cf7fb38aeaaffc61acfd
ngimel authored and facebook-github-bot committed Feb 5, 2021
1 parent 1065c2d commit d3023d8
Showing 4 changed files with 64 additions and 155 deletions.
68 changes: 0 additions & 68 deletions docs/source/ddp_comm_hooks.rst

This file was deleted.

1 change: 0 additions & 1 deletion docs/source/index.rst
@@ -71,7 +71,6 @@ Features described in this documentation are classified by release status:
onnx
optim
complex_numbers
-ddp_comm_hooks
pipeline
quantization
rpc
137 changes: 58 additions & 79 deletions torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
@@ -33,24 +33,22 @@ def _orthogonalize(matrix, epsilon=1e-8):


class PowerSGDState(object):
r"""
"""
Stores both the gradient compression configs and the internal states for all the gradients during the training.
-Particularly, ``matrix_approximation_rank`` and ``start_powerSGD_iter`` are the main configs that need to be tuned by the user.
-Although ``use_error_feedback`` and ``warm_start`` can also be tuned by the user,
+Particularly, `matrix_approximation_rank` and `start_powerSGD_iter` are the main configs that need to be tuned by the user.
+Although `use_error_feedback` and `warm_start` can also be tuned by the user,
they are typically turned on for performance.
-Note [Guidance to Tune ``matrix_approximation_rank`` And ``start_powerSGD_iter``]
+Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`]
~~~~~~~~~~~~~~~~~~~~~~~~~~
-1. To tune ``matrix_approximation_rank``, the user can increase it from 1 by factors of 2,
+1) To tune `matrix_approximation_rank`, the user can increase it from 1 by factors of 2,
until a satisfying accuracy can be reached.
-The increase of ``matrix_approximation_rank`` can substantially increase the computation costs of the compression.
-However, the accuracy may not be further improved beyond a certain ``matrix_approximation_rank`` value.
-2. To tune ``start_powerSGD_iter``, the user can typically start with 10% of total training steps,
+The increase of `matrix_approximation_rank` can substantially increase the computation costs of the compression.
+However, the accuracy may not be further improved beyond a certain `matrix_approximation_rank` value.
+2) To tune `start_powerSGD_iter`, the user can typically start with 10% of total training steps,
and increase it until a satisfying accuracy can be reached.
Deferring PowerSGD can effectively improve the accuracy,
-even when a relatively small ``matrix_approximation_rank`` is used.
+even when a relatively small `matrix_approximation_rank` is used.
This is because the beginning of the training phase is usually very sensitive to inaccurate gradients,
and compressing gradients too early may make the training quickly take a suboptimal trajectory,
which can result in an irrecoverable impact on the accuracy.
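
Taken together, the tuning guidance above amounts to a registration call like the sketch below. This is not part of this diff: `ddp_model` is assumed to be an existing DDP-wrapped module, an initialized default process group is assumed, and the values are illustrative.

```python
# Sketch: applying the tuning guidance above. All values are illustrative.
import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD

state = powerSGD.PowerSGDState(
    process_group=None,           # None falls back to the default (WORLD) group
    matrix_approximation_rank=2,  # start at 1 and double until accuracy is satisfying
    start_powerSGD_iter=1000,     # ~10% of total training steps is a common starting point
    use_error_feedback=True,      # typically left on for accuracy
    warm_start=True,              # typically left on for performance
)
ddp_model.register_comm_hook(state, powerSGD.powerSGD_hook)  # ddp_model: assumed DDP module
```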
@@ -164,44 +162,38 @@ def maybe_increase_iter(self, bucket):


def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
r"""
"""
This DDP communication hook implements the original PowerSGD gradient compression
algorithm described in https://arxiv.org/abs/1905.13727.
Once gradient tensors are aggregated across all workers, this hook applies
compression as follows:
-1. Views the input flattened 1D gradient tensor as two groups of per-parameter tensors: high-rank tensors and vector-like rank-1 tensors (for biases).
-2. Handles rank-1 tensors by allreducing them without compression:
-2.1. Allocates contiguous memory for those rank-1 tensors, and allreduces all the rank-1 tensors as a batch, without compression;
-2.2. Copies the individual rank-1 tensors from the contiguous memory back to the input tensor.
-3. Handles high-rank tensors by PowerSGD compression:
-3.1. For each high-rank tensor M, creates two low-rank tensors P and Q for decomposing M,
+1) Views the input flattened 1D gradient tensor as two groups of per-parameter tensors:
+high-rank tensors and vector-like rank-1 tensors (for biases).
+2) Handles rank-1 tensors by allreducing them without compression:
+2.1) Allocates contiguous memory for those rank-1 tensors,
+and allreduces all the rank-1 tensors as a batch, without compression;
+2.2) Copies the individual rank-1 tensors from the contiguous memory back to the input tensor.
+3) Handles high-rank tensors by PowerSGD compression:
+3.1) For each high-rank tensor M, creates two low-rank tensors P and Q for decomposing M,
such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
-3.2. Computes each P in Ps, which is equal to MQ;
-3.3. Allreduces Ps as a batch;
-3.4. Orthogonalizes each P in Ps;
-3.5. Computes each Q in Qs, which is approximately equal to M^TP;
-3.6. Allreduces Qs as a batch;
-3.7. Computes each M among all the high-rank tensors, which is approximately equal to PQ^T.
-Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations.
+3.2) Computes each P in Ps, which is equal to MQ;
+3.3) Allreduces Ps as a batch;
+3.4) Orthogonalizes each P in Ps;
+3.5) Computes each Q in Qs, which is approximately equal to M^TP;
+3.6) Allreduces Qs as a batch;
+3.7) Computes each M among all the high-rank tensors, which is approximately equal to PQ^T.
+Note that this communication hook enforces vanilla allreduce for the first `state.start_powerSGD_iter` iterations.
This not only allows the user finer control over the tradeoff between speedup and accuracy,
but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers.
TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration --
one left multiplication and one right multiplication.
For warm-start, can take one such step at a time, and alternate between them.
Args:
state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
-To tune the compression configs, see Note [Guidance to Tune ``matrix_approximation_rank`` And ``start_powerSGD_iter``].
+To tune the compression configs, see Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`].
bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
Note that since DDP comm hook only supports the single-process single-device mode at this time,
only exactly one tensor is stored in this bucket.
@@ -210,9 +202,9 @@ def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
Future handler of the communication, which updates the gradients in place.
Example::
->>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
+state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
>>> ddp_model.register_comm_hook(state, powerSGD_hook)
""" # noqa
"""
process_group = state.process_group
group_to_use = process_group if process_group is not None else dist.group.WORLD
world_size = group_to_use.size()
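
For illustration, steps 3.1-3.7 above reduce to the following single-matrix sketch. This is a standalone toy, not the hook's actual code: `torch.linalg.qr` stands in for this file's `_orthogonalize` helper, and the allreduce steps are only indicated in comments.

```python
import torch

def power_sgd_round(M: torch.Tensor, rank: int = 1) -> torch.Tensor:
    """One round of rank-`rank` PowerSGD compression on a single 2D gradient M."""
    Q = torch.randn(M.shape[1], rank, dtype=M.dtype)  # 3.1: Q from a standard normal...
    Q, _ = torch.linalg.qr(Q)                         # ...then orthogonalized
    P = M @ Q                                         # 3.2: P = MQ
    # 3.3: dist.all_reduce(P) would sum P across workers here.
    P, _ = torch.linalg.qr(P)                         # 3.4: orthogonalize P
    Q = M.t() @ P                                     # 3.5: Q ~= M^T P
    # 3.6: dist.all_reduce(Q) would sum Q across workers here.
    return P @ Q.t()                                  # 3.7: decompressed M ~= P Q^T

M = torch.randn(128, 64)
M_hat = power_sgd_round(M, rank=4)
print(f"relative error: {(M - M_hat).norm() / M.norm():.3f}")
```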
@@ -382,10 +374,6 @@ def compute_qs(fut):
for tensor, p, q in zip(high_rank_tensors, ps, qs):
torch.matmul(tensor.t(), p, out=q)

-# TODO: The above procedure does two matmul+allreduce steps per iteration --
-# one left multiplication and one right multiplication.
-# For warm-start, can take one such step at a time, and alternate between them.

# Allreduce Qs.
return [
dist.all_reduce(
@@ -424,44 +412,40 @@ def decompress(fut):


def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
r"""
"""
This DDP communication hook implements a simplified PowerSGD gradient compression
algorithm described in https://arxiv.org/abs/1905.13727.
Once gradient tensors are aggregated across all workers, this hook applies
compression to the flattened input tensor that batches per-parameter tensors as follows:
-1. Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
-2. Creates two low-rank tensors P and Q for decomposing M, such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
-3. Computes P, which is equal to MQ;
-4. Allreduces P;
-5. Orthogonalizes P;
-6. Computes Q, which is approximately equal to M^TP;
-7. Allreduces Q;
-8. Computes M, which is approximately equal to PQ^T.
-9. Truncates the input tensor to the original length.
-This variant is faster than :meth:`powerSGD_hook` that runs layer-wise gradient compression,
-but it usually results in a much lower accuracy, unless ``matrix_approximation_rank`` in the state is 1.
-Increasing ``matrix_approximation_rank`` may not necessarily increase the accuracy,
+1) Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
+2) Creates two low-rank tensors P and Q for decomposing M,
+such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
+3) Computes P, which is equal to MQ;
+4) Allreduces P;
+5) Orthogonalizes P;
+6) Computes Q, which is approximately equal to M^TP;
+7) Allreduces Q;
+8) Computes M, which is approximately equal to PQ^T;
+9) Truncates the input tensor to the original length.
+This variant is faster than `powerSGD_hook` that runs layer-wise gradient compression,
+but it usually results in a much lower accuracy, unless `matrix_approximation_rank` in the state is 1.
+Increasing `matrix_approximation_rank` may not necessarily increase the accuracy,
because batching per-parameter tensors without column/row alignment can destroy low-rank structure.
-Therefore, the user should always consider :meth:`powerSGD_hook` first,
-and only consider this variant when a satisfying accuracy can be achieved when ``matrix_approximation_rank`` is 1.
+Therefore, the user should always consider `powerSGD_hook` first,
+and only consider this variant when a satisfying accuracy can be achieved when `matrix_approximation_rank` is 1.
-Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations.
+Note that this communication hook enforces vanilla allreduce for the first `state.start_powerSGD_iter` iterations.
This not only allows the user finer control over the tradeoff between speedup and accuracy,
but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers.
TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration --
one left multiplication and one right multiplication.
For warm-start, can take one such step at a time, and alternate between them.
Args:
state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
-To tune the compression configs, see Note [Guidance to Tune ``matrix_approximation_rank`` And ``start_powerSGD_iter``].
+To tune the compression configs, see Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`].
bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
Note that since DDP comm hook only supports the single-process single-device mode at this time,
only exactly one tensor is stored in this bucket.
@@ -470,9 +454,9 @@ def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
Future handler of the communication, which updates the gradients in place.
Example::
->>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
+state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
>>> ddp_model.register_comm_hook(state, batched_powerSGD_hook)
""" # noqa
"""
process_group = state.process_group
group_to_use = process_group if process_group is not None else dist.group.WORLD
world_size = group_to_use.size()
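
As a standalone illustration of steps 1-9 of this batched variant (again a toy sketch under assumptions, not the hook's code; the allreduces are only indicated in comments):

```python
import math
import torch

def batched_low_rank_approx(flat_grad: torch.Tensor, rank: int = 1) -> torch.Tensor:
    n = flat_grad.numel()
    side = math.ceil(math.sqrt(n))
    M = torch.zeros(side * side, dtype=flat_grad.dtype)  # step 1: pad with zeros...
    M[:n] = flat_grad
    M = M.view(side, side)                               # ...and view as a square tensor
    Q = torch.randn(side, rank, dtype=M.dtype)           # step 2: random Q, orthogonalized
    Q, _ = torch.linalg.qr(Q)
    P = M @ Q                                            # step 3: P = MQ
    # step 4: dist.all_reduce(P) would run here.
    P, _ = torch.linalg.qr(P)                            # step 5: orthogonalize P
    Q = M.t() @ P                                        # step 6: Q ~= M^T P
    # step 7: dist.all_reduce(Q) would run here.
    M_hat = P @ Q.t()                                    # step 8: M ~= P Q^T
    return M_hat.reshape(-1)[:n]                         # step 9: truncate to original length

approx = batched_low_rank_approx(torch.randn(1000), rank=1)
```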
@@ -579,11 +563,6 @@ def compute_q(fut):
out=state.q_memory_dict[bucket_index],
)


-# TODO: The above procedure does two matmul+allreduce steps per iteration --
-# one left multiplication and one right multiplication.
-# For warm-start, can take one such step at a time, and alternate between them.

return [
dist.all_reduce(
state.q_memory_dict[bucket_index], group=group_to_use, async_op=True
13 changes: 6 additions & 7 deletions torch/nn/parallel/distributed.py
@@ -1021,13 +1021,12 @@ def register_comm_hook(self, state: object, hook: callable):
parameter syncs while running Distributed DataParallel training.
Args:
-state (object): Passed to the hook to maintain any state information during the training process.
-Examples include error feedback in gradient compression,
-peers to communicate with next in GossipGrad, etc.
-It is locally stored by each worker
-and shared by all the gradient tensors on the worker.
-hook (callable): Averages gradient tensors across workers and defined as:
+state (object): state is passed to the hook and can be used to maintain
+and update any state information that users would like to
+maintain as part of the training process. Examples: error
+feedback in gradient compression, peers to communicate with
+next in GossipGrad, etc.
+hook (callable): is defined as:
hook(state: object, bucket: dist._GradBucket) -> torch.futures.Future:
This function is called once the bucket is ready. The
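
For context, a minimal hook matching this signature might look like the sketch below, modeled on the stock allreduce hook of this era. Treat it as an assumption-laden illustration: the `bucket` and future APIs have changed in later PyTorch releases, and an initialized default process group is assumed.

```python
import torch
import torch.distributed as dist

def allreduce_hook(state: object, bucket: dist._GradBucket) -> torch.futures.Future:
    group = dist.group.WORLD
    world_size = group.size()
    tensor = bucket.get_tensors()[0]  # one tensor per bucket in single-process single-device mode
    fut = dist.all_reduce(tensor, group=group, async_op=True).get_future()
    # Average (rather than sum) the gradients before DDP copies them back.
    return fut.then(lambda f: [f.value()[0].div_(world_size)])

# ddp_model.register_comm_hook(state=None, hook=allreduce_hook)
```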
