
Add support for virtual steps that accumulate per-sample or clipped gradients #16

Closed · wants to merge 32 commits

Commits (32):
3572425  accumulate per-sample batches (ftramer, May 13, 2020)
6235399  merge with refactored clipper (ftramer, May 13, 2020)
8df2433  accumulate in autograd (ftramer, May 13, 2020)
8274334  per-sample grad accumulation (ftramer, May 13, 2020)
fa9fec3  accumulate mini-batches in gradient clipper (ftramer, May 14, 2020)
f646ebb  gradient accumulator for virtual batches (ftramer, May 14, 2020)
9dcc569  rename 'accumulate_grads' to 'virtual_step' (ftramer, May 14, 2020)
0da3fbc  typo (ftramer, May 14, 2020)
affcb1b  replace lambda function by functools.partial (ftramer, May 15, 2020)
1ada364  allow for batches that are smaller than expected (ftramer, May 15, 2020)
d716398  graciously handle the zero-privacy case in the analysis (ftramer, May 15, 2020)
68ca40e  keep activations in memory (ftramer, May 15, 2020)
b5d4e64  fix failing test (ftramer, May 15, 2020)
6e241c3  store accumulated gradients in each parameter (ftramer, May 15, 2020)
d95818d  merge changes to privacy engine (ftramer, May 19, 2020)
452ae78  remove the patching on zero_grad and delete accumulated gradients (ftramer, May 19, 2020)
562b2e4  Fix tests that assume that param.grad_sample will still be available (ftramer, May 19, 2020)
f40a0d0  Run black formatter (ftramer, May 19, 2020)
ab10000  Run black formatter (ftramer, May 19, 2020)
f8e2645  effective_batch_size => n_accumulation_steps (ftramer, May 20, 2020)
ed74cc2  Merge branch 'master' of https://github.com/facebookresearch/pytorch-dp (ftramer, May 21, 2020)
c422de5  Merge branch 'master' of https://github.com/facebookresearch/pytorch-dp (ftramer, May 21, 2020)
74a436d  fix comments, linter, warnings (ftramer, May 26, 2020)
eaa87ad  comment and formatting fixes (ftramer, May 27, 2020)
e5ab1e5  comment fixes (ftramer, May 27, 2020)
80f30ed  clarify batch_size retrieval (ftramer, May 27, 2020)
d6314fb  clarify batch_size retrieval (ftramer, May 27, 2020)
9ddacff  clarify virtual batch test cases (ftramer, May 27, 2020)
58e39f6  test that accumulated gradients are erased upon a call to optimizer.s… (ftramer, May 27, 2020)
a8ca4dc  remove un-used zero_grad method (ftramer, May 27, 2020)
c3f6601  implement virtual batches (ftramer, May 27, 2020)
8a6599e  Merge branch 'virtual_batch' (ftramer, May 27, 2020)
22 changes: 18 additions & 4 deletions examples/imagenet.py
@@ -3,7 +3,6 @@

"""
Runs ImageNet training with differential privacy.

"""

import argparse
@@ -48,7 +47,7 @@

# The following lines enable stat gathering for the clipping process
# and set a default of per layer clipping for the Privacy Engine
clipping = {"clip_per_layer": True, "enable_stat": True}
clipping = {"clip_per_layer": False, "enable_stat": True}

parser = argparse.ArgumentParser(description="PyTorch ImageNet DP Training")
parser.add_argument("data", metavar="DIR", help="path to dataset")
@@ -80,6 +79,14 @@
"batch size of all GPUs on the current node when "
"using Data Parallel or Distributed Data Parallel",
)
parser.add_argument(
"-na",
"--n_accumulation_steps",
default=1,
type=int,
metavar="N",
help="number of mini-batches to accumulate into an effective batch for SGD",
)
parser.add_argument(
"--lr",
"--learning-rate",
@@ -371,7 +378,7 @@ def main_worker(gpu, ngpus_per_node, args):
print("PRIVACY ENGINE ON")
privacy_engine = PrivacyEngine(
model,
batch_size=args.batch_size,
batch_size=args.batch_size * args.n_accumulation_steps,
sample_size=len(train_dataset),
alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
noise_multiplier=args.sigma,
@@ -455,7 +462,14 @@ def train(train_loader, model, criterion, optimizer, epoch, args):
# compute gradient and do SGD step
optimizer.zero_grad()
loss.backward()
optimizer.step()

if args.n_accumulation_steps > 1:
optimizer.virtual_step()

# make sure we take a step after processing the last mini-batch in the
# epoch to ensure we start the next epoch with a clean state
if ((i + 1) % args.n_accumulation_steps == 0) or ((i + 1) == len(train_loader)):
optimizer.step()
Contributor:
Are we not taking a step if len_data / batch is not divisible by the effective batch?

Contributor Author (@ftramer), May 19, 2020:

Here's an example of what I had in mind:

  • len(train_loader) = 1250
  • batch_size = 100
  • effective_batch_size = 500

so n_virtual_steps = 5, and i ranges from 0 to 12

In a training epoch, we'll call optimizer.step() when (i+1)=5, and when (i+1)=10, to process batches of size 500.
Then the last iterations will accumulate mini-batches of size 100, 100, and 50, and exit the training loop.
So we're left with 3 mini-batches accumulated into param.grad_sample or param.summed_grad.

Contributor:

Exactly. So what happens to those in the next epoch? Am I missing something?
We either have to drop them by calling zero_grad or handle them at the end of the loop, correct?

Contributor Author (@ftramer), May 19, 2020:

Right, so in the next epoch i is reset to 0, so we'll accumulate 5 more mini-batches before we call optimizer.step().
So now we have an effective batch consisting of 8 mini-batches (3 from the previous epoch and 5 from the current epoch), with 750 samples in total.

This is problematic for two reasons:

  1. It could cause memory issues if we're planning on holding the entire batch in memory, and we were expecting at most 500 gradients.

  2. The privacy accounting for that batch is incorrect because we're assuming a sample probability of effective_batch_size / len(train_loader) = 500/1250 whereas this batch has a sample probability of 750/1250.

Contributor:

So my thought is that we should err on the conservative side (as epsilon is an upper bound).
To do that I'd suggest one of the following:
either:
calling step() once at the end of the loop and checking inside step() whether there is unfinished business. We could check the accumulation state or, cleaner yet, have a flag that tracks it: calling a virtual step sets the flag, and a real step (virtual=False) clears it.
or:
do the same, but call the method finalize() [it just sounds better and is less confusing IMO]. For this you could simply promote finalize_batch.
or:
handle everything internally in the engine, counting the number of calls to step and finalizing whenever needed. For this there would be no need for a virtual_step method at all. [I kind of like this one, but I also know it completely contradicts the talks we had offline, so maybe we can do this in another diff later on; let me know what you think :)]

Contributor:

I like that!

Contributor:

Could you make this change as well, please (I'd assume it would need some minimal changes in the step function too), and we should be more than ready to land this amazing diff.

Contributor Author (@ftramer):

Ok. There's still the question of when/where to delete the param.grad_sample fields so that they stop accumulating.
If we want to do that in clipper.step(), we'd have to rewrite a bunch of tests.
I think the most natural thing is still to monkeypatch optimizer.zero_grad() and do this there, as it aligns nicely with the semantics of zero_grad() for accumulation of regular gradients.

Contributor Author (@ftramer):

With the latest commits, the deletions are now handled directly in clipper.step(), so that gradient aggregators are reset as soon as they are consumed.
And we just have to make sure to always take a step on the last mini-batch of a training epoch.

Contributor:

It's true that zero_grad would make more sense, but we save a whole lot of memory by doing it in clipper, so :)


# measure elapsed time
batch_time.update(time.time() - end)
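
Putting the pieces of the diff above back together, the intended loop in train() reads roughly as follows. This is an illustrative sketch, not the exact file contents; it assumes the PrivacyEngine was attached to optimizer with batch_size=args.batch_size * args.n_accumulation_steps, as in the diff.

```python
# Sketch of gradient accumulation with virtual steps (illustrative).
for i, (images, target) in enumerate(train_loader):
    output = model(images)
    loss = criterion(output, target)

    optimizer.zero_grad()
    loss.backward()

    if args.n_accumulation_steps > 1:
        # Virtual step: fold this mini-batch's clipped/per-sample gradients
        # into the accumulated gradients without updating the weights.
        optimizer.virtual_step()

    # Take a real step after every effective batch, and also after the last
    # mini-batch of the epoch so nothing carries over into the next epoch.
    if ((i + 1) % args.n_accumulation_steps == 0) or ((i + 1) == len(train_loader)):
        optimizer.step()
```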
136 changes: 66 additions & 70 deletions torchdp/autograd_grad_sample.py
@@ -6,7 +6,7 @@
Original license is Unlicense. We put it here for user's convenience, with
the author's permission.
"""

from functools import partial
from typing import List

import torch
@@ -23,24 +23,42 @@
_enforce_fresh_backprop: bool = False


def add_hooks(model: nn.Module) -> None:
def add_hooks(model: nn.Module, loss_type: str = "mean", batch_dim: int = 0) -> None:
"""
Adds hooks to model to save activations and backprop values.
The hooks will
1. save activations into param.activations during forward pass
2. append backprops to params.backprops_list during backward pass.
2. compute per-sample gradients in params.grad_sample during backward pass.
Call "remove_hooks(model)" to disable this.
Args:
model:
model: the model to which hooks are added
loss_type: either "mean" or "sum" depending on whether backpropped
loss was averaged or summed over batch (default: "mean")
batch_dim: the batch dimension (default: 0)
"""
if hasattr(model, "autograd_grad_sample_hooks"):
raise ValueError("Trying to add hooks twice to the same model")

global _hooks_disabled
_hooks_disabled = False

if loss_type not in ("sum", "mean"):
raise ValueError(
f"loss_type = {loss_type}. Only 'sum' and 'mean' losses are supported"
)

handles = []
for layer in model.modules():
if get_layer_type(layer) in _supported_layers_grad_samplers.keys():
handles.append(layer.register_forward_hook(_capture_activations))
handles.append(layer.register_backward_hook(_capture_backprops))

handles.append(
layer.register_backward_hook(
partial(
_capture_backprops, loss_type=loss_type, batch_dim=batch_dim
)
)
)

model.__dict__.setdefault("autograd_grad_sample_hooks", []).extend(handles)
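
For orientation, a rough usage sketch of the hooks added above on a toy model. This is a hypothetical example, not part of the diff; it assumes add_hooks is importable from torchdp.autograd_grad_sample, that nn.Linear is among the supported layers, and that per-sample gradients are stored with shape (batch_size, *parameter_shape).

```python
import torch
import torch.nn as nn

from torchdp.autograd_grad_sample import add_hooks

model = nn.Sequential(nn.Linear(10, 5))
add_hooks(model, loss_type="sum", batch_dim=0)

x = torch.randn(4, 10)       # a batch of 4 samples
loss = model(x).sum()        # summed loss, matching loss_type="sum"
loss.backward()

# The forward hook stored the layer's activations; the backward hook turned
# the captured backprops into per-sample gradients in param.grad_sample.
print(model[0].weight.grad_sample.shape)  # expected: torch.Size([4, 5, 10])
```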

@@ -88,82 +106,60 @@ def _capture_activations(
layer.activations = input[0].detach()


def _capture_backprops(layer: nn.Module, _input, output):
"""Append backprop to layer.backprops_list in backward pass."""
global _enforce_fresh_backprop
def _capture_backprops(
layer: nn.Module,
_input: torch.Tensor,
output: torch.Tensor,
loss_type: str,
batch_dim: int,
):
"""Capture backprops in backward pass and store per-sample gradients."""

if _hooks_disabled:
return

if _enforce_fresh_backprop:
if hasattr(layer, "backprops_list"):
raise ValueError(
f"Seeing result of previous backprop, "
f"use clear_backprops(model) to clear"
)
_enforce_fresh_backprop = False

if not hasattr(layer, "backprops_list"):
layer.backprops_list = []
layer.backprops_list.append(output[0].detach())


def clear_backprops(model: nn.Module) -> None:
"""Delete layer.backprops_list in every layer."""
for layer in model.modules():
if hasattr(layer, "backprops_list"):
del layer.backprops_list


def _check_layer_sanity(layer):
if not hasattr(layer, "activations"):
raise ValueError(
f"No activations detected for {type(layer)},"
" run forward after add_hooks(model)"
)
if not hasattr(layer, "backprops_list"):
raise ValueError("No backprops detected, run backward after add_hooks(model)")
if len(layer.backprops_list) != 1:
raise ValueError(
"Multiple backprops detected, make sure to call clear_backprops(model)"
)
backprops = output[0].detach()
_compute_grad_sample(layer, backprops, loss_type, batch_dim)


def compute_grad_sample(
model: nn.Module, loss_type: str = "mean", batch_dim: int = 0
def _compute_grad_sample(
layer: nn.Module, backprops: torch.Tensor, loss_type: str, batch_dim: int
) -> None:
"""
Compute per-example gradients and save them under 'param.grad_sample'.
Must be called after loss.backprop()
Args:
model:
loss_type: either "mean" or "sum" depending whether backpropped
layer: the layer for which per-sample gradients are computed
backprops: the captured backprops
loss_type: either "mean" or "sum" depending on whether backpropped
loss was averaged or summed over batch
batch_dim: the batch dimension
"""
if loss_type not in ("sum", "mean"):
raise ValueError(f"loss_type = {loss_type}. Only 'sum' and 'mean' supported")
for layer in model.modules():
layer_type = get_layer_type(layer)
if (
not requires_grad(layer)
or layer_type not in _supported_layers_grad_samplers.keys()
):
continue

_check_layer_sanity(layer)

A = layer.activations
n = A.shape[batch_dim]
if loss_type == "mean":
B = layer.backprops_list[0] * n
else: # loss_type == 'sum':
B = layer.backprops_list[0]
# rearrange the blob dimensions
if batch_dim != 0:
A = A.permute([batch_dim] + [x for x in range(A.dim()) if x != batch_dim])
B = B.permute([batch_dim] + [x for x in range(B.dim()) if x != batch_dim])
# compute grad sample for individual layers
compute_layer_grad_sample = _supported_layers_grad_samplers.get(
get_layer_type(layer)
layer_type = get_layer_type(layer)
if (
not requires_grad(layer)
or layer_type not in _supported_layers_grad_samplers.keys()
):
return

if not hasattr(layer, "activations"):
raise ValueError(
f"No activations detected for {type(layer)},"
" run forward after add_hooks(model)"
)
compute_layer_grad_sample(layer, A, B)

A = layer.activations
n = A.shape[batch_dim]
if loss_type == "mean":
B = backprops * n
else: # loss_type == 'sum':
B = backprops
# rearrange the blob dimensions
if batch_dim != 0:
A = A.permute([batch_dim] + [x for x in range(A.dim()) if x != batch_dim])
B = B.permute([batch_dim] + [x for x in range(B.dim()) if x != batch_dim])
# compute grad sample for individual layers
compute_layer_grad_sample = _supported_layers_grad_samplers.get(
get_layer_type(layer)
)
compute_layer_grad_sample(layer, A, B)
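
For reference, the samplers looked up in _supported_layers_grad_samplers take the activations A and the rescaled backprops B prepared above and produce per-sample gradients. A rough sketch of what such a sampler could look like for nn.Linear in the plain 2-D case (illustrative only; the samplers shipped with the library may also accumulate into an existing grad_sample when virtual batches are used):

```python
import torch
import torch.nn as nn

def _linear_grad_sample_sketch(layer: nn.Linear, A: torch.Tensor, B: torch.Tensor) -> None:
    # A: activations of shape (batch, in_features)
    # B: backprops of shape (batch, out_features), already rescaled for loss_type
    # Per-sample weight gradient: one outer product of B[n] and A[n] per sample.
    layer.weight.grad_sample = torch.einsum("ni,nj->nij", B, A)
    if layer.bias is not None:
        # The per-sample bias gradient is just the (rescaled) backprop itself.
        layer.bias.grad_sample = B
```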