From f7398759b48340f27e59d25db9dbbf3f1da5f13e Mon Sep 17 00:00:00 2001
From: Rohan Varma
Date: Tue, 13 Oct 2020 21:09:57 -0700
Subject: [PATCH] Only populate grad accumulator to var mapping for
 find_unused_parameters=True in DDP (#45942)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45942

We only need to keep track of this mapping for traversing the autograd graph
when find_unused_parameters=True. Without this change, we populate and keep
the mapping in memory unconditionally, which occupies sizeof(pointer) *
(number of grad accumulators) of extra memory.
ghstack-source-id: 114219289

Test Plan: CI

Reviewed By: mrshenli

Differential Revision: D24154407

fbshipit-source-id: 220d723e262f36590a03a3fd2dab47cbfdb87d40
---
 torch/csrc/distributed/c10d/reducer.cpp | 9 ++++++---
 torch/csrc/distributed/c10d/reducer.h   | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp
index 53541960f300..74942d1c77d8 100644
--- a/torch/csrc/distributed/c10d/reducer.cpp
+++ b/torch/csrc/distributed/c10d/reducer.cpp
@@ -117,8 +117,11 @@ Reducer::Reducer(
 
       // Map raw function pointer to replica index and parameter index.
       // This is used later on when the autograd graph is traversed
-      // to check for parameters for which no gradient is computed.
-      func_[grad_accumulator.get()] = index;
+      // to check for parameters for which no gradient is computed, if
+      // find_unused_parameters=True.
+      if (find_unused_parameters_) {
+        gradAccToVariableMap_[grad_accumulator.get()] = index;
+      }
 
       // The gradient accumulator is stored as weak_ptr in the autograd
       // metadata of the variable, so we have to keep it alive here for
@@ -991,7 +994,7 @@ void Reducer::prepare_for_backward(
   }
 
   // Find accumulator functions that don't show up in this graph.
-  for (const auto& it : func_) {
+  for (const auto& it : gradAccToVariableMap_) {
     // If the accumulator function is present in the graph, we know
     // a gradient will be computed for the corresponding parameter.
     if (seen.count(it.first) > 0) {
diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h
index efb2060a5533..29bdace7ce00 100644
--- a/torch/csrc/distributed/c10d/reducer.h
+++ b/torch/csrc/distributed/c10d/reducer.h
@@ -122,7 +122,8 @@ class Reducer {
   std::vector<std::vector<std::shared_ptr<torch::autograd::Node>>>
       grad_accumulators_;
 
-  std::unordered_map<torch::autograd::Node*, VariableIndex> func_;
+  std::unordered_map<torch::autograd::Node*, VariableIndex>
+      gradAccToVariableMap_;
 
   std::vector<std::pair<uintptr_t, std::shared_ptr<torch::autograd::Node>>>
       hooks_;
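
Illustrative note (not part of the original commit message): the code path touched
here is gated on DDP's find_unused_parameters flag. Below is a minimal sketch of
the user-facing setting that triggers it, assuming a single-process CPU "gloo"
process group purely so the example runs standalone; real DDP jobs use one
process per device.

    import os
    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP

    # Single-process process group so the sketch is self-contained
    # (assumption: CPU + gloo backend, rank 0 of world size 1).
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)

    model = torch.nn.Linear(10, 10)
    # Only with find_unused_parameters=True does the Reducer now populate
    # gradAccToVariableMap_ and traverse the autograd graph in
    # prepare_for_backward() to detect parameters that receive no gradient.
    ddp_model = DDP(model, find_unused_parameters=True)

    loss = ddp_model(torch.randn(4, 10)).sum()
    loss.backward()
    dist.destroy_process_group()

With find_unused_parameters left at its default of False, the mapping is no
longer built, saving the per-grad-accumulator memory described in the summary.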