From d3023d86ba6eebbcfeafc1633d1a8f8249d12e6d Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein
Date: Thu, 4 Feb 2021 22:35:37 -0800
Subject: [PATCH] Revert D26249330: [Gradient Compression] Add a documentation
 page for DDP communication hooks

Test Plan: revert-hammer

Differential Revision:
D26249330 (https://github.com/pytorch/pytorch/commit/e62aabac43720b42f89cbe0ebd76fa22099ac698)

Original commit changeset: ab973390ddb7

fbshipit-source-id: d508daed76219e7ca588cf7fb38aeaaffc61acfd
---
 docs/source/ddp_comm_hooks.rst          |  68 ---------
 docs/source/index.rst                   |   1 -
 .../ddp_comm_hooks/powerSGD_hook.py     | 137 ++++++++----------
 torch/nn/parallel/distributed.py        |  13 +-
 4 files changed, 64 insertions(+), 155 deletions(-)
 delete mode 100644 docs/source/ddp_comm_hooks.rst

diff --git a/docs/source/ddp_comm_hooks.rst b/docs/source/ddp_comm_hooks.rst
deleted file mode 100644
index b4908cab8da5..000000000000
--- a/docs/source/ddp_comm_hooks.rst
+++ /dev/null
@@ -1,68 +0,0 @@
-DDP Communication Hooks
-=======================
-
-DDP communication hook is a generic interface to control how to communicate
-gradients across workers by overriding the vanilla allreduce in
-`DistributedDataParallel `_.
-A few built-in communication hooks are provided,
-and users can easily apply any of these hooks to optimize communication.
-Besides, the hook interface can also support user-defined communication
-strategies for more advanced use cases.
-
-.. warning ::
-    DDP communication hook is experimental and subject to change.
-
-.. warning ::
-    DDP communication hooks can only support single process single device mode
-    on NCCL backend.
-
-How to Use A Communication Hook?
---------------------------------
-
-To use a communication hook, the user just needs to let the DDP model register
-the hook before the training loop.
-
-.. automethod:: torch.nn.parallel.DistributedDataParallel.register_comm_hook
-
-Default Communication Hooks
----------------------------
-
-Default communication hooks are simple **stateless** hooks, so the input state
-in ``register_comm_hook`` is either a process group or ``None``.
-
-.. automodule:: torch.distributed.algorithms.ddp_comm_hooks.default_hooks
-    :members:
-
-PowerSGD Communication Hook
----------------------------
-
-PowerSGD communication hook is a **stateful** hook used for gradient
-compression, and the user needs to provide a state defined as below.
-The performance is `on par with `_
-the implementation in the original `paper `_.
-
-PowerSGD State
-^^^^^^^^^^^^^^^^
-
-.. currentmodule:: torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook
-.. autoclass:: PowerSGDState
-
-PowerSGD Hooks
-^^^^^^^^^^^^^^^^
-
-.. warning ::
-    PowerSGD requires an extra copy of gradients for error feedback,
-    which may be infeasible for use cases that have a memory constraint.
-
-.. warning ::
-    The current implementation may cause gradient overflow for FP16 input.
-
-.. autofunction:: powerSGD_hook
-.. autofunction:: batched_powerSGD_hook
-
-Acknowledgements
-----------------
-
-Thanks PowerSGD paper author Thijs Vogels for the code review on PowerSGD
-communication hook and the
-`comparison experiments `_.
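To use a communication hook, the DDP model registers the hook before the training loop, as the deleted page describes. A minimal sketch, assuming ``dist.init_process_group("nccl", ...)`` has already run and that ``model`` and ``rank`` come from the surrounding training script; ``fp16_compress_hook`` is one of the built-in stateless default hooks, so ``None`` (the global process group) is passed as state:

    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP
    from torch.distributed.algorithms.ddp_comm_hooks import default_hooks

    # Assumes the NCCL process group and CUDA devices are already set up.
    ddp_model = DDP(model, device_ids=[rank])

    # Stateless default hooks take a process group (or None for the global
    # group) as state; the hook then replaces the vanilla allreduce.
    ddp_model.register_comm_hook(state=None, hook=default_hooks.fp16_compress_hook)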
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 010df3c36e23..595d164df4a4 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -71,7 +71,6 @@ Features described in this documentation are classified by release status:
    onnx
    optim
    complex_numbers
-   ddp_comm_hooks
    pipeline
    quantization
    rpc
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
index eaae325253ce..60607b7bbf49 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
@@ -33,24 +33,22 @@ def _orthogonalize(matrix, epsilon=1e-8):


 class PowerSGDState(object):
-    r"""
+    """
     Stores both the gradient compression configs and the internal states for all the gradients during the training.
-    Particularly, ``matrix_approximation_rank`` and ``start_powerSGD_iter`` are the main configs that need to be tuned by the user.
-    Although ``use_error_feedback`` and ``warm_start`` can also be tuned by the user,
+    Particularly, `matrix_approximation_rank` and `start_powerSGD_iter` are the main configs that need to be tuned by the user.
+    Although `use_error_feedback` and `warm_start` can also be tuned by the user,
     they are typically turned on for performance.

-    Note [Guidance to Tune ``matrix_approximation_rank`` And ``start_powerSGD_iter``]
+    Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`]
     ~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-    1. To tune ``matrix_approximation_rank``, the user can increase it from 1 by factors of 2,
+    1) To tune `matrix_approximation_rank`, the user can increase it from 1 by factors of 2,
     until a satisfying accuracy can be reached.
-    The increase of ``matrix_approximation_rank`` can substantially increase the computation costs of the compression.
-    However, the accuracy may not be futher improved beyond a certain ``matrix_approximation_rank`` value.
-
-    2. To tune ``start_powerSGD_iter``, the user can typically start with 10% of total training steps,
+    The increase of `matrix_approximation_rank` can substantially increase the computation costs of the compression.
+    However, the accuracy may not be further improved beyond a certain `matrix_approximation_rank` value.
+    2) To tune `start_powerSGD_iter`, the user can typically start with 10% of total training steps,
     and increase it until a satisfying accuracy can be reached.
     Deferring PowerSGD can effectively improve the accuracy,
-    even a relatively small ``matrix_approximation_rank`` is used.
+    even if a relatively small `matrix_approximation_rank` is used.
     This is because the beginning of the training phase is usually very sensitive to inaccurate gradients,
     and compressing gradients too early may make the training quickly take a suboptimal trajectory,
     which can result in an irrecoverable impact on the accuracy.
@@ -164,44 +162,38 @@ def maybe_increase_iter(self, bucket):


 def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
-    r"""
+    """
     This DDP communication hook implements the original PowerSGD gradient compression
     algorithm described in https://arxiv.org/abs/1905.13727.
     Once gradient tensors are aggregated across all workers, this hook applies
     compression as follows:
-
-    1. Views the input flattened 1D gradient tensor as two groups of per-parameter tensors: high-rank tensors and vector-like rank-1 tensors (for biases).
-
-    2. Handles rank-1 tensors by allreducing them without compression:
-
-        2.1. Allocate contiguous memory for those rank-1 tensors, and allreduces all the rank-1 tensors as a batch, without compression;
-
-        2.2. Copies the individual rank-1 tensors from the contiguous memory back to the input tensor.
-
-    3. Handles high-rank tensors by PowerSGD compression:
-
-        3.1. For each high-rank tensor M, creates two low-rank tensors P and Q for decomposing M,
+    1) Views the input flattened 1D gradient tensor as two groups of per-parameter tensors:
+    high-rank tensors and vector-like rank-1 tensors (for biases).
+    2) Handles rank-1 tensors by allreducing them without compression:
+        2.1) Allocate contiguous memory for those rank-1 tensors,
+        and allreduces all the rank-1 tensors as a batch, without compression;
+        2.2) Copies the individual rank-1 tensors from the contiguous memory back to the input tensor.
+    3) Handles high-rank tensors by PowerSGD compression:
+        3.1) For each high-rank tensor M, creates two low-rank tensors P and Q for decomposing M,
     such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
-
-        3.2. Computes each P in Ps, which is equal to MQ;
-
-        3.3. Allreduces Ps as a batch;
-
-        3.4. Orthogonalizes each P in Ps;
-
-        3.5. Computes each Q in Qs, which is approximately equal to M^TP;
-
-        3.6. Allreduces Qs as a batch;
-
-        3.7. Computes each M among all the high-rank tensors, which is approximately equal to PQ^T.
-
-    Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations.
+        3.2) Computes each P in Ps, which is equal to MQ;
+        3.3) Allreduces Ps as a batch;
+        3.4) Orthogonalizes each P in Ps;
+        3.5) Computes each Q in Qs, which is approximately equal to M^TP;
+        3.6) Allreduces Qs as a batch;
+        3.7) Computes each M among all the high-rank tensors, which is approximately equal to PQ^T.
+
+    Note that this communication hook enforces vanilla allreduce for the first `state.start_powerSGD_iter` iterations.
     This can not only allow the user to have a finer tuning over the tradeoff between speedup and accuracy,
     but also help abstract away some complexity of the internal optimization of DDP for future communication hook developers.

+    TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration --
+    one left multiplication and one right multiplication.
+    For warm-start, can take one such step at a time, and alternate between them.
+
     Args:
         state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
-            To tune the compression configs, see Note [Guidance to Tune ``matrix_approximation_rank`` And ``start_powerSGD_iter``].
+            To tune the compression configs, see Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`].
         bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
             Note that since DDP comm hook only supports single process single device mode at this time,
             only exactly one tensor is stored in this bucket.

     Returns:
         Future handler of the communication, which updates the gradients in place.
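To make steps 3.1 to 3.7 concrete, here is a minimal single-tensor sketch of one PowerSGD round, assuming a recent PyTorch. It uses QR factorization in place of the hook's Gram-Schmidt `_orthogonalize`, and the allreduce steps appear only as comments, since the real hook averages the batched Ps and Qs across workers:

    import torch

    torch.manual_seed(0)
    M = torch.randn(64, 32)  # one "high-rank" gradient tensor from the bucket
    rank = 4                 # matrix_approximation_rank

    # 3.1) Q is initialized from a standard normal distribution and orthogonalized.
    Q, _ = torch.linalg.qr(torch.randn(32, rank))
    P = M @ Q                      # 3.2) P = MQ
    # 3.3) dist.all_reduce(P) would average P across workers here.
    P, _ = torch.linalg.qr(P)      # 3.4) orthogonalize P
    Q = M.t() @ P                  # 3.5) Q ~= M^T P
    # 3.6) dist.all_reduce(Q) would average Q across workers here.
    M_approx = P @ Q.t()           # 3.7) M ~= PQ^T
    print((M - M_approx).norm() / M.norm())  # relative compression error

At rank 4 the two low-rank factors hold (64 + 32) * 4 values instead of 64 * 32, which is where the communication savings come from.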
    Example::
-        >>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
+        state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
         >>> ddp_model.register_comm_hook(state, powerSGD_hook)
-    """  # noqa
+    """
     process_group = state.process_group
     group_to_use = process_group if process_group is not None else dist.group.WORLD
     world_size = group_to_use.size()
@@ -382,10 +374,6 @@ def compute_qs(fut):
         for tensor, p, q in zip(high_rank_tensors, ps, qs):
             torch.matmul(tensor.t(), p, out=q)

-        # TODO: The above procedure does two matmul+allreduce steps per iteration --
-        # one left multiplication and one right multiplication.
-        # For warm-start, can take one such step at a time, and alternate between them.
-
         # Allreduce Qs.
         return [
             dist.all_reduce(
@@ -424,44 +412,40 @@ def decompress(fut):


 def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
-    r"""
+    """
     This DDP communication hook implements a simplified PowerSGD gradient compression
     algorithm described in https://arxiv.org/abs/1905.13727.
     Once gradient tensors are aggregated across all workers, this hook applies
     compression to the flattened input tensor that batches per-parameter tensors as follows:
-
-    1. Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
-
-    2. Creates two low-rank tensors P and Q for decomposing M, such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
-
-    3. Computes P, which is equal to MQ;
-
-    4. Allreduces P;
-
-    5. Orthogonalizes P;
-
-    6. Computes Q, which is approximately equal to M^TP;
-
-    7. Allreduces Q;
-
-    8. Computes M, which is approximately equal to PQ^T.
-
-    9. Truncates the input tensor to the original length.
-
-    This variant is faster than :meth:`powerSGD_hook` that runs layer-wise gradient compression,
-    but it usually results in a much lower accuracy, unless ``matrix_approximation_rank`` in the state is 1.
-    Increasing ``matrix_approximation_rank`` may not necessarily increase the accuracy,
+    1) Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
+    2) Creates two low-rank tensors P and Q for decomposing M,
+    such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
+    3) Computes P, which is equal to MQ;
+    4) Allreduces P;
+    5) Orthogonalizes P;
+    6) Computes Q, which is approximately equal to M^TP;
+    7) Allreduces Q;
+    8) Computes M, which is approximately equal to PQ^T.
+    9) Truncates the input tensor to the original length.
+
+    This variant is faster than `powerSGD_hook` that runs layer-wise gradient compression,
+    but it usually results in a much lower accuracy, unless `matrix_approximation_rank` in the state is 1.
+    Increasing `matrix_approximation_rank` may not necessarily increase the accuracy,
     because batching per-parameter tensors without column/row alignment can destroy low-rank structure.
-    Therefore, the user should always consider :meth:`powerSGD_hook` first,
-    and only consider this variant when a satisfying accuracy can be achieved when ``matrix_approximation_rank`` is 1.
+    Therefore, the user should always consider `powerSGD_hook` first,
+    and only consider this variant when a satisfying accuracy can be achieved when `matrix_approximation_rank` is 1.
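The square reshaping in steps 1) and 9) is the distinctive part of this variant; the middle steps are one P/Q round exactly as in the powerSGD_hook sketch above. A minimal sketch of the reshaping, assuming a recent PyTorch and a hypothetical bucket length of 1000:

    import math
    import torch

    grad = torch.randn(1000)                   # flattened 1D gradient bucket
    side = math.ceil(math.sqrt(grad.numel()))  # smallest square that fits the bucket
    padded = torch.zeros(side * side)
    padded[: grad.numel()] = grad
    M = padded.view(side, side)                # 1) square view with 0 paddings
    # ... one P/Q compression round on M, as in the powerSGD_hook sketch above ...
    restored = M.reshape(-1)[: grad.numel()]   # 9) truncate to the original length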
-    Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations.
+    Note that this communication hook enforces vanilla allreduce for the first `state.start_powerSGD_iter` iterations.
     This can not only allow the user to have a finer tuning over the tradeoff between speedup and accuracy,
     but also help abstract away some complexity of the internal optimization of DDP for future communication hook developers.

+    TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration --
+    one left multiplication and one right multiplication.
+    For warm-start, can take one such step at a time, and alternate between them.
+
     Args:
         state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
-            To tune the compression configs, see Note [Guidance to Tune ``matrix_approximation_rank`` And ``start_powerSGD_iter``].
+            To tune the compression configs, see Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`].
         bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
             Note that since DDP comm hook only supports single process single device mode at this time,
             only exactly one tensor is stored in this bucket.

     Returns:
         Future handler of the communication, which updates the gradients in place.

     Example::
-        >>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
+        state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
         >>> ddp_model.register_comm_hook(state, batched_powerSGD_hook)
-    """  # noqa
+    """
     process_group = state.process_group
     group_to_use = process_group if process_group is not None else dist.group.WORLD
     world_size = group_to_use.size()
@@ -579,11 +563,6 @@ def compute_q(fut):
             out=state.q_memory_dict[bucket_index],
         )
-
-        # TODO: The above procedure does two matmul+allreduce steps per iteration --
-        # one left multiplication and one right multiplication.
-        # For warm-start, can take one such step at a time, and alternate between them.
-
         return [
             dist.all_reduce(
                 state.q_memory_dict[bucket_index], group=group_to_use, async_op=True
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index b1c0cbe65fc2..876309f4589f 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -1021,13 +1021,12 @@ def register_comm_hook(self, state: object, hook: callable):
         parameter syncs while running Distributed DataParallel training.

         Args:
-            state (object): Passed to the hook to maintain any state information during the training process.
-                            Examples include error feedback in gradient compression,
-                            peers to communicate with next in GossipGrad, etc.
-
-                            It is locally stored by each worker
-                            and shared by all the gradient tensors on the worker.
-            hook (callable): Averages gradient tensors across workers and defined as:
+            state (object): state is passed to the hook and can be used to maintain
+                            and update any state information that users would like to
+                            maintain as part of the training process. Examples: error
+                            feedback in gradient compression, peers to communicate with
+                            next in GossipGrad, etc.
+            hook (callable): is defined as:
                              hook(state: object, bucket: dist._GradBucket) -> torch.futures.Future:

                              This function is called once the bucket is ready. The
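Although the hunk is truncated here, the documented signature is complete enough to sketch a custom hook. A minimal example that reproduces vanilla allreduce averaging, assuming this era's bucket API exposes `get_tensors()` as the built-in default hooks do (the hook name is illustrative):

    import torch
    import torch.distributed as dist

    def allreduce_avg_hook(state: object, bucket: dist._GradBucket) -> torch.futures.Future:
        """Average the bucket's flattened gradient across workers (vanilla behavior)."""
        group = dist.group.WORLD
        world_size = group.size()
        tensor = bucket.get_tensors()[0]  # single tensor per bucket, as noted above
        fut = dist.all_reduce(tensor, group=group, async_op=True).get_future()

        def div_by_world_size(fut):
            # DDP expects the hook's future to resolve to the reduced tensor(s).
            return [fut.value()[0].div_(world_size)]

        return fut.then(div_by_world_size)

    # Registration mirrors the examples above:
    # ddp_model.register_comm_hook(state=None, hook=allreduce_avg_hook)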