From f7398759b48340f27e59d25db9dbbf3f1da5f13e Mon Sep 17 00:00:00 2001
From: Rohan Varma
Date: Tue, 13 Oct 2020 21:09:57 -0700
Subject: [PATCH] Only populate grad accumulator to var mapping for
 find_unused_parameters=True in DDP (#45942)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45942

We only need to keep track of this mapping for traversing the autograd graph
when find_unused_parameters=True. Without this change, we populate and keep
the mapping in memory unconditionally, which occupies sizeof(pointer) *
(number of grad accumulators) of extra memory.
ghstack-source-id: 114219289

Test Plan: CI

Reviewed By: mrshenli

Differential Revision: D24154407

fbshipit-source-id: 220d723e262f36590a03a3fd2dab47cbfdb87d40
---
 torch/csrc/distributed/c10d/reducer.cpp | 9 ++++++---
 torch/csrc/distributed/c10d/reducer.h   | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp
index 53541960f300..74942d1c77d8 100644
--- a/torch/csrc/distributed/c10d/reducer.cpp
+++ b/torch/csrc/distributed/c10d/reducer.cpp
@@ -117,8 +117,11 @@ Reducer::Reducer(
 
       // Map raw function pointer to replica index and parameter index.
       // This is used later on when the autograd graph is traversed
-      // to check for parameters for which no gradient is computed.
-      func_[grad_accumulator.get()] = index;
+      // to check for parameters for which no gradient is computed, if
+      // find_unused_parameters=True.
+      if (find_unused_parameters_) {
+        gradAccToVariableMap_[grad_accumulator.get()] = index;
+      }
 
       // The gradient accumulator is stored as weak_ptr in the autograd
       // metadata of the variable, so we have to keep it alive here for
@@ -991,7 +994,7 @@ void Reducer::prepare_for_backward(
   }
 
   // Find accumulator functions that don't show up in this graph.
-  for (const auto& it : func_) {
+  for (const auto& it : gradAccToVariableMap_) {
     // If the accumulator function is present in the graph, we know
     // a gradient will be computed for the corresponding parameter.
     if (seen.count(it.first) > 0) {
diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h
index efb2060a5533..29bdace7ce00 100644
--- a/torch/csrc/distributed/c10d/reducer.h
+++ b/torch/csrc/distributed/c10d/reducer.h
@@ -122,7 +122,8 @@ class Reducer {
   std::vector<std::vector<std::shared_ptr<torch::autograd::Node>>>
       grad_accumulators_;
 
-  std::unordered_map<torch::autograd::Node*, VariableIndex> func_;
+  std::unordered_map<torch::autograd::Node*, VariableIndex>
+      gradAccToVariableMap_;
 
   std::vector<std::pair<uintptr_t, std::shared_ptr<torch::autograd::Node>>>
       hooks_;
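
Illustrative note (not part of the original commit message): the code path touched
here is gated on DDP's find_unused_parameters flag. Below is a minimal sketch of
the user-facing setting that triggers it, assuming a single-process CPU "gloo"
process group purely so the example runs standalone; real DDP jobs use one
process per device.

    import os
    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP

    # Single-process process group so the sketch is self-contained
    # (assumption: CPU + gloo backend, rank 0 of world size 1).
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)

    model = torch.nn.Linear(10, 10)
    # Only with find_unused_parameters=True does the Reducer now populate
    # gradAccToVariableMap_ and traverse the autograd graph in
    # prepare_for_backward() to detect parameters that receive no gradient.
    ddp_model = DDP(model, find_unused_parameters=True)

    loss = ddp_model(torch.randn(4, 10)).sum()
    loss.backward()
    dist.destroy_process_group()

With find_unused_parameters left at its default of False, the mapping is no
longer built, saving the per-grad-accumulator memory described in the summary.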