/
__init__.py
265 lines (228 loc) · 12.9 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
"""
``torch.autograd`` provides classes and functions implementing automatic
differentiation of arbitrary scalar valued functions. It requires minimal
changes to the existing code - you only need to declare :class:`Tensor` s
for which gradients should be computed with the ``requires_grad=True`` keyword.
As of now, we only support autograd for floating point :class:`Tensor` types (
half, float, double and bfloat16) and complex :class:`Tensor` types (cfloat, cdouble).
"""
import torch
import warnings
from torch.types import _TensorOrTensors
from typing import Any, Callable, List, Optional, Sequence, Tuple, Union
from .variable import Variable
from .function import Function, NestedIOFunction
from .gradcheck import gradcheck, gradgradcheck
from .grad_mode import no_grad, enable_grad, set_grad_enabled, inference_mode
from .anomaly_mode import detect_anomaly, set_detect_anomaly
from ..overrides import has_torch_function, handle_torch_function
from . import functional
from . import forward_ad
__all__ = ['Variable', 'Function', 'backward', 'grad_mode']
_OptionalTensor = Optional[torch.Tensor]
def _make_grads(outputs: Sequence[torch.Tensor], grads: Sequence[_OptionalTensor]) -> Tuple[_OptionalTensor, ...]:
new_grads: List[_OptionalTensor] = []
for out, grad in zip(outputs, grads):
if isinstance(grad, torch.Tensor):
if not out.shape == grad.shape:
raise RuntimeError("Mismatch in shape: grad_output["
+ str(grads.index(grad)) + "] has a shape of "
+ str(grad.shape) + " and output["
+ str(outputs.index(out)) + "] has a shape of "
+ str(out.shape) + ".")
if out.dtype.is_complex != grad.dtype.is_complex:
raise RuntimeError("For complex Tensors, both grad_output and output"
" are required to have the same dtype."
" Mismatch in dtype: grad_output["
+ str(grads.index(grad)) + "] has a dtype of "
+ str(grad.dtype) + " and output["
+ str(outputs.index(out)) + "] has a dtype of "
+ str(out.dtype) + ".")
new_grads.append(grad)
elif grad is None:
if out.requires_grad:
if out.numel() != 1:
raise RuntimeError("grad can be implicitly created only for scalar outputs")
new_grads.append(torch.ones_like(out, memory_format=torch.preserve_format))
else:
new_grads.append(None)
else:
raise TypeError("gradients can be either Tensors or None, but got " +
type(grad).__name__)
return tuple(new_grads)
def _tensor_or_tensors_to_tuple(tensors: Optional[_TensorOrTensors], length: int) -> Tuple[_OptionalTensor, ...]:
if tensors is None:
return (None, ) * length
if isinstance(tensors, torch.Tensor):
return (tensors, )
return tuple(tensors)
def backward(
tensors: _TensorOrTensors,
grad_tensors: Optional[_TensorOrTensors] = None,
retain_graph: Optional[bool] = None,
create_graph: bool = False,
grad_variables: Optional[_TensorOrTensors] = None,
inputs: Optional[_TensorOrTensors] = None,
) -> None:
r"""Computes the sum of gradients of given tensors with respect to graph
leaves.
The graph is differentiated using the chain rule. If any of ``tensors``
are non-scalar (i.e. their data has more than one element) and require
gradient, then the Jacobian-vector product would be computed, in this
case the function additionally requires specifying ``grad_tensors``.
It should be a sequence of matching length, that contains the "vector"
in the Jacobian-vector product, usually the gradient of the differentiated
function w.r.t. corresponding tensors (``None`` is an acceptable value for
all tensors that don't need gradient tensors).
This function accumulates gradients in the leaves - you might need to zero
``.grad`` attributes or set them to ``None`` before calling it.
See :ref:`Default gradient layouts<default-grad-layouts>`
for details on the memory layout of accumulated gradients.
.. note::
Using this method with ``create_graph=True`` will create a reference cycle
between the parameter and its gradient which can cause a memory leak.
We recommend using ``autograd.grad`` when creating the graph to avoid this.
If you have to use this function, make sure to reset the ``.grad`` fields of your
parameters to ``None`` after use to break the cycle and avoid the leak.
.. note::
If you run any forward ops, create ``grad_tensors``, and/or call ``backward``
in a user-specified CUDA stream context, see
:ref:`Stream semantics of backward passes<bwd-cuda-stream-semantics>`.
Args:
tensors (Sequence[Tensor] or Tensor): Tensors of which the derivative will be
computed.
grad_tensors (Sequence[Tensor or None] or Tensor, optional): The "vector" in
the Jacobian-vector product, usually gradients w.r.t. each element of
corresponding tensors. None values can be specified for scalar Tensors or
ones that don't require grad. If a None value would be acceptable for all
grad_tensors, then this argument is optional.
retain_graph (bool, optional): If ``False``, the graph used to compute the grad
will be freed. Note that in nearly all cases setting this option to ``True``
is not needed and often can be worked around in a much more efficient
way. Defaults to the value of ``create_graph``.
create_graph (bool, optional): If ``True``, graph of the derivative will
be constructed, allowing to compute higher order derivative products.
Defaults to ``False``.
inputs (Sequence[Tensor] or Tensor, optional): Inputs w.r.t. which the gradient
be will accumulated into ``.grad``. All other Tensors will be ignored. If
not provided, the gradient is accumulated into all the leaf Tensors that
were used to compute the attr::tensors. All the provided inputs must be leaf
Tensors.
"""
if grad_variables is not None:
warnings.warn("'grad_variables' is deprecated. Use 'grad_tensors' instead.")
if grad_tensors is None:
grad_tensors = grad_variables
else:
raise RuntimeError("'grad_tensors' and 'grad_variables' (deprecated) "
"arguments both passed to backward(). Please only "
"use 'grad_tensors'.")
if inputs is not None and len(inputs) == 0:
raise RuntimeError("'inputs' argument to backward() cannot be empty.")
tensors = (tensors,) if isinstance(tensors, torch.Tensor) else tuple(tensors)
inputs = (inputs,) if isinstance(inputs, torch.Tensor) else \
tuple(inputs) if inputs is not None else tuple()
grad_tensors_ = _tensor_or_tensors_to_tuple(grad_tensors, len(tensors))
grad_tensors_ = _make_grads(tensors, grad_tensors_)
if retain_graph is None:
retain_graph = create_graph
Variable._execution_engine.run_backward(
tensors, grad_tensors_, retain_graph, create_graph, inputs,
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
def grad(
outputs: _TensorOrTensors,
inputs: _TensorOrTensors,
grad_outputs: Optional[_TensorOrTensors] = None,
retain_graph: Optional[bool] = None,
create_graph: bool = False,
only_inputs: bool = True,
allow_unused: bool = False
) -> Tuple[torch.Tensor, ...]:
r"""Computes and returns the sum of gradients of outputs with respect to
the inputs.
``grad_outputs`` should be a sequence of length matching ``output``
containing the "vector" in Jacobian-vector product, usually the pre-computed
gradients w.r.t. each of the outputs. If an output doesn't require_grad,
then the gradient can be ``None``).
If ``only_inputs`` is ``True``, the function will only return a list of gradients
w.r.t the specified inputs. If it's ``False``, then gradient w.r.t. all remaining
leaves will still be computed, and will be accumulated into their ``.grad``
attribute.
.. note::
If you run any forward ops, create ``grad_outputs``, and/or call ``grad``
in a user-specified CUDA stream context, see
:ref:`Stream semantics of backward passes<bwd-cuda-stream-semantics>`.
Args:
outputs (sequence of Tensor): outputs of the differentiated function.
inputs (sequence of Tensor): Inputs w.r.t. which the gradient will be
returned (and not accumulated into ``.grad``).
grad_outputs (sequence of Tensor): The "vector" in the Jacobian-vector product.
Usually gradients w.r.t. each output. None values can be specified for scalar
Tensors or ones that don't require grad. If a None value would be acceptable
for all grad_tensors, then this argument is optional. Default: None.
retain_graph (bool, optional): If ``False``, the graph used to compute the grad
will be freed. Note that in nearly all cases setting this option to ``True``
is not needed and often can be worked around in a much more efficient
way. Defaults to the value of ``create_graph``.
create_graph (bool, optional): If ``True``, graph of the derivative will
be constructed, allowing to compute higher order derivative products.
Default: ``False``.
allow_unused (bool, optional): If ``False``, specifying inputs that were not
used when computing outputs (and therefore their grad is always zero)
is an error. Defaults to ``False``.
"""
outputs = (outputs,) if isinstance(outputs, torch.Tensor) else tuple(outputs)
inputs = (inputs,) if isinstance(inputs, torch.Tensor) else tuple(inputs)
overridable_args = outputs + inputs
if has_torch_function(overridable_args):
return handle_torch_function(
grad,
overridable_args,
outputs,
inputs,
grad_outputs=grad_outputs,
retain_graph=retain_graph,
create_graph=create_graph,
only_inputs=only_inputs,
allow_unused=allow_unused,
)
if not only_inputs:
warnings.warn("only_inputs argument is deprecated and is ignored now "
"(defaults to True). To accumulate gradient for other "
"parts of the graph, please use torch.autograd.backward.")
grad_outputs_ = _tensor_or_tensors_to_tuple(grad_outputs, len(outputs))
grad_outputs_ = _make_grads(outputs, grad_outputs_)
if retain_graph is None:
retain_graph = create_graph
return Variable._execution_engine.run_backward(
outputs, grad_outputs_, retain_graph, create_graph,
inputs, allow_unused, accumulate_grad=False)
# This function applies in case of gradient checkpointing for memory
# optimization. Currently, gradient checkpointing is supported only if the
# execution engine is invoked through torch.autograd.backward() and its
# inputs argument is not passed. It is not supported for torch.autograd.grad().
# This is because if inputs are specified, the gradient won't be calculated for
# anything else e.g. model parameters like weights, bias etc.
#
# This function returns whether the checkpointing is valid i.e. torch.autograd.backward
# or not i.e. torch.autograd.grad. The implementation works by maintaining a thread
# local variable in torch/csrc/autograd/engine.cpp which looks at the NodeTask
# in the stack and before a NodeTask is executed in evaluate_function, it
# checks for whether reentrant backwards is imperative or not.
# See https://github.com/pytorch/pytorch/pull/4594 for more discussion/context
def _is_checkpoint_valid():
return Variable._execution_engine.is_checkpoint_valid()
def variable(*args, **kwargs):
warnings.warn("torch.autograd.variable(...) is deprecated, use torch.tensor(...) instead")
return torch.tensor(*args, **kwargs)
if not torch._C._autograd_init():
raise RuntimeError("autograd initialization failed")
# Import all native method/classes
from torch._C._autograd import (DeviceType, ProfilerActivity, ProfilerState, ProfilerConfig, ProfilerEvent,
_enable_profiler_legacy, _disable_profiler_legacy, _profiler_enabled,
_enable_record_function, _set_empty_test_observer, kineto_available,
_supported_kineto_activities, _add_metadata_json)
if kineto_available():
from torch._C._autograd import (_ProfilerResult, _KinetoEvent,
_prepare_profiler, _enable_profiler, _disable_profiler)
from . import profiler