From fac738cc714cd589233770727ceb1c031aa2eb9b Mon Sep 17 00:00:00 2001
From: wayi
Date: Thu, 26 Nov 2020 16:36:02 -0800
Subject: [PATCH 1/3] [Gradient Compression] Add a random generator to PowerSGD state for initializing low-rank matrix Q

Previously, the random seed was the length of the input tensor, which is not guaranteed to be different for different batches. Now a random generator is initialized in the PowerSGD state and used to create a random seed that randomizes the low-rank tensor Q at every step. Therefore, the initial tensor Q should be the same across all the replicas at the same step, but different at different steps.

'torch.manual_seed' is used in the same way as in https://github.com/epfml/powersgd/blob/master/gradient_reducers.py#L675

Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202

Differential Revision: [D25191589](https://our.internmc.facebook.com/intern/diff/D25191589/)

[ghstack-poisoned]
---
 .../algorithms/ddp_comm_hooks/__init__.py     |  8 ++++--
 .../ddp_comm_hooks/powerSGD_hook.py           | 25 ++++++++-----------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
index 11aec8ab8a61..f25f3a8caad8 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
@@ -15,13 +15,17 @@ def _ddp_comm_hook_wrapper(comm_hook, model, state):
     model.register_comm_hook(state, comm_hook)
 
 
-def _powerSGD_comm_hook_wrapper(comm_hook, model, state, matrix_approximation_rank):
+def _powerSGD_comm_hook_wrapper(
+    comm_hook, model, state, matrix_approximation_rank, random_seed=0
+):
     """
     To be consistent with the wrappers of other DDP comm hooks, the input state only needs to be a process group,
     which will be wrapped up with other state info.
     """
     powerSGD_state = powerSGD.PowerSGDState(
-        process_group=state, matrix_approximation_rank=matrix_approximation_rank
+        process_group=state,
+        matrix_approximation_rank=matrix_approximation_rank,
+        random_seed=random_seed,
     )
     model.register_comm_hook(powerSGD_state, comm_hook)
 
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
index 814a24cf262a..85d4fe553b52 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
@@ -1,5 +1,6 @@
 import math
 
+import numpy as np
 import torch
 import torch.distributed as dist
 
@@ -29,11 +30,12 @@ def _orthogonalize(matrix, epsilon=1e-8):
 
 
 class PowerSGDState(object):
-    __slots__ = ["process_group", "matrix_approximation_rank"]
+    __slots__ = ["process_group", "matrix_approximation_rank", "rng"]
 
-    def __init__(self, process_group, matrix_approximation_rank=1):
+    def __init__(self, process_group, matrix_approximation_rank=1, random_seed=0):
         self.process_group = process_group
         self.matrix_approximation_rank = matrix_approximation_rank
+        self.rng = np.random.RandomState(random_seed)
 
 
 def powerSGD_hook(
@@ -92,25 +94,20 @@ def powerSGD_hook(
     input_tensor[total_length:padded_total_length].fill_(0)
     matrix = input_tensor.view(square_side_length, square_side_length)
 
-    def create_low_rank_tensor(fill_random_values):
+    def create_low_rank_tensor(fill_random_values, rng):
         "Returns a low-rank 2D tensor of square_side_length * matrix_approximation_rank."
         if fill_random_values:
-            with torch.random.fork_rng(devices=[device]):
-                # The seed makes sure that the initial random values are the same across all the DDP replicas.
-                # Such seed should differ at every step.
-                # Currently use the length of input tensor as the seed, which should be mostly different.
-                # TODO(wayi@): Should read the random seed from the state of this hook provided by the constructor.
-                torch.manual_seed(total_length)
-                return torch.randn(
-                    square_side_length, state.matrix_approximation_rank, device=device
-                )
+            torch.manual_seed(rng.randint(1_000_000_000))
+            return torch.randn(
+                square_side_length, state.matrix_approximation_rank, device=device
+            )
         else:
             return torch.empty(
                 square_side_length, state.matrix_approximation_rank, device=device
             )
 
-    p = create_low_rank_tensor(fill_random_values=False)
-    q = create_low_rank_tensor(fill_random_values=True)
+    p = create_low_rank_tensor(fill_random_values=False, rng=state.rng)
+    q = create_low_rank_tensor(fill_random_values=True, rng=state.rng)
     _orthogonalize(q, 0)
 
     torch.matmul(matrix, q, out=p)

From 563914da1e68bd756e04984bca064e94c3a10274 Mon Sep 17 00:00:00 2001
From: wayi
Date: Fri, 27 Nov 2020 00:58:01 -0800
Subject: [PATCH 2/3] Update on "[Gradient Compression] Add a random generator to PowerSGD state for initializing low-rank matrix Q"

Previously, the random seed was the length of the input tensor, which is not guaranteed to be different for different batches. Now a random generator is initialized in the PowerSGD state and used to create a random seed that randomizes the low-rank tensor Q at every step. Therefore, the initial tensor Q should be the same across all the replicas at the same step, but different at different steps.

'torch.manual_seed' is used in the same way as in https://github.com/epfml/powersgd/blob/master/gradient_reducers.py#L675

Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202

Differential Revision: [D25191589](https://our.internmc.facebook.com/intern/diff/D25191589/)

[ghstack-poisoned]
---
 .../ddp_comm_hooks/powerSGD_hook.py | 28 +++++++++++--------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
index 85d4fe553b52..d9b0c3ff91f1 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
@@ -48,14 +48,15 @@ def powerSGD_hook(
     Once gradient tensors are aggregated across all workers, this hook applies compression as follows:
     1) Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
-    2) Decomposes M into two low-rank tensors P and Q,
+    2) Creates two low-rank tensors P and Q for decomposing M,
     such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
-    2) Allreduces P;
-    3) Orthogonizes P;
-    4) Compute Q, which is approximately equal to M^TP;
-    5) Allreduces Q;
-    6) Computes M, which is approximately equal to PQ^T.
-    7) Truncates the input tensor to the original length.
+    2) Computes P, which is equal to MQ;
+    3) Allreduces P;
+    4) Orthogonizes P;
+    5) Computes Q, which is approximately equal to M^TP;
+    6) Allreduces Q;
+    7) Computes M, which is approximately equal to PQ^T.
+    8) Truncates the input tensor to the original length.
     TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration --
     one left multiplication and one right multiplication.
@@ -97,10 +98,15 @@ def powerSGD_hook(
     def create_low_rank_tensor(fill_random_values, rng):
         "Returns a low-rank 2D tensor of square_side_length * matrix_approximation_rank."
         if fill_random_values:
-            torch.manual_seed(rng.randint(1_000_000_000))
-            return torch.randn(
-                square_side_length, state.matrix_approximation_rank, device=device
-            )
+            with torch.random.fork_rng(devices=[]):
+                # The seed makes sure that the initial random values are the same across all the DDP replicas.
+                # Such seed should differ at every step.
+                # Since it is very slow to fork RNG state across all the CUDA devices,
+                # only fork on CPU and then move the generated tensor to the CUDA device.
+                torch.manual_seed(rng.randint(1_000_000_000))
+                return torch.randn(
+                    square_side_length, state.matrix_approximation_rank, device="cpu"
+                ).to(device)
         else:
             return torch.empty(
                 square_side_length, state.matrix_approximation_rank, device=device

From 973be1cdace5ebc7efec879b2f4b557edb39abc0 Mon Sep 17 00:00:00 2001
From: wayi
Date: Mon, 30 Nov 2020 12:02:50 -0800
Subject: [PATCH 3/3] Update on "[Gradient Compression] Add a random generator to PowerSGD state for initializing low-rank matrix Q"

Previously, the random seed was the length of the input tensor, which is not guaranteed to be different for different batches. Now a random generator is initialized in the PowerSGD state and used to create a random seed that randomizes the low-rank tensor Q at every step. Therefore, the initial tensor Q should be the same across all the replicas at the same step, but different at different steps.

'torch.manual_seed' is used in the same way as in https://github.com/epfml/powersgd/blob/master/gradient_reducers.py#L675

Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202

Differential Revision: [D25191589](https://our.internmc.facebook.com/intern/diff/D25191589/)

[ghstack-poisoned]
---
 .../algorithms/ddp_comm_hooks/powerSGD_hook.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
index 38cd433009e4..9a6fbb4a31dd 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
@@ -35,9 +35,11 @@ class PowerSGDState(object):
     def __init__(self, process_group, matrix_approximation_rank=1, random_seed=0):
         self.process_group = process_group
         self.matrix_approximation_rank = matrix_approximation_rank
-        # The purpose of RNG is to generate different random seed for initializing Q across iterations, but in the same order for all replicas.
-        # Different random seeds across iterations means different 'projections' of the gradients at different SGD steps.
-        # If the same random projection is used, there will be differences between the gradients that are never synchronized.
+        # The purpose of this RNG is to generate different random seeds for initializing Q across iterations,
+        # but in the same order for all the DDP replicas.
+        # Different random seeds across iterations indicate different 'projections' of the gradients at different SGD steps.
+        # If the same random projection is used,
+        # there will be differences between the gradients that are never synchronized.
         self.rng = np.random.RandomState(random_seed)
 
 
@@ -102,7 +104,7 @@ def create_low_rank_tensor(fill_random_values, rng):
         "Returns a low-rank 2D tensor of square_side_length * matrix_approximation_rank."
         if fill_random_values:
             with torch.random.fork_rng(devices=[]):
-                # Fork this RNG to avoid chaning the seed globally and affecting the random sampling anywhere else in the training.
+                # Fork this RNG to avoid changing the seed globally and affecting the random sampling anywhere else in the training.
                 # The seed makes sure that the initial random values are the same across all the DDP replicas.
                 # Such seed should differ at every step.
                 # Since it is very slow to fork RNG state across all the CUDA devices,
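
For illustration only (not part of the patch series above): a minimal, standalone sketch of the seeding scheme these commits implement, followed by one simplified PowerSGD round matching the updated docstring. The sizes n and rank, the helper sample_q, and the single-process loop are placeholders invented for this sketch; the real hook derives the matrix shape from the flattened gradient bucket, uses state.rng and state.matrix_approximation_rank, and allreduces P and Q across workers between the matmuls.

import numpy as np
import torch

n, rank = 8, 1                          # placeholder sizes for this sketch
rng = np.random.RandomState(0)          # every replica constructs this with the same random_seed

def sample_q(step_seed, device="cpu"):
    # Fork only the CPU RNG (forking every CUDA RNG state is slow), seed it with the
    # shared per-step seed, draw Q on CPU, then move it to the target device.
    with torch.random.fork_rng(devices=[]):
        torch.manual_seed(step_seed)
        return torch.randn(n, rank, device="cpu").to(device)

for step in range(3):
    step_seed = rng.randint(1_000_000_000)   # advances identically on every replica
    q = sample_q(step_seed)
    # q is identical across replicas at this step, but differs from step to step.
    print(step, step_seed, q.flatten()[:3].tolist())

# One simplified (single-process, allreduce-free) compression round, with rank 1 so that
# orthogonalization reduces to column normalization: P = MQ, orthogonalize P,
# Q = M^T P, and M is approximated by P Q^T.
matrix = torch.randn(n, n)
q = sample_q(rng.randint(1_000_000_000))
q /= q.norm()
p = matrix @ q
p /= p.norm()
q = matrix.t() @ p
approx = p @ q.t()
print("relative error:", ((approx - matrix).norm() / matrix.norm()).item())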