From 012b88d7a2d80de99b2b580a3a67b052799cfccc Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu
@@ -4632,7 +4632,7 @@
Automatic Mixed Precision package - torch.amp
Autocasting#
Return a bool indicating if autocast is available on device_type.
Instances of autocast serve as context managers or decorators that
allow regions of your script to run in mixed precision. In these regions,
ops run in an op-specific dtype chosen by autocast.
Create a helper decorator for forward methods of custom autograd functions.
Create a helper decorator for backward methods of custom autograd functions.
Autograd functions are subclasses of torch.autograd.Function.
See the example page for more detail.
Ensures that backward executes with the same autocast state as forward.
@@ -4675,7 +4675,7 @@ Automatic Mixed Precision package - torch.amp
torch.autocast.
torch.cuda.amp.autocast(args...) is deprecated. Please use torch.amp.autocast("cuda", args...) instead.
@@ -4684,21 +4684,21 @@
Automatic Mixed Precision package - torch.amp
torch.cuda.amp.custom_fwd(args...) is deprecated. Please use
torch.amp.custom_fwd(args..., device_type='cuda') instead.
torch.cuda.amp.custom_bwd(args...) is deprecated. Please use
torch.amp.custom_bwd(args..., device_type='cuda') instead.
torch.autocast.
torch.cpu.amp.autocast(args...) is deprecated. Please use torch.amp.autocast("cpu", args...) instead.
@@ -4729,7 +4729,7 @@
Automatic Mixed Precision package - torch.amp
torch.amp.GradScaler.
torch.cuda.amp.GradScaler(args...) is deprecated. Please use torch.amp.GradScaler("cuda", args...) instead.
@@ -4738,7 +4738,7 @@
Automatic Mixed Precision package - torch.amp
torch.amp.GradScaler.
torch.cpu.amp.GradScaler(args...) is deprecated. Please use torch.amp.GradScaler("cpu", args...) instead.
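Taken together, the deprecation notes above amount to swapping the device-specific entry points for the unified torch.amp ones. A minimal sketch of the replacement pattern, assuming a model, optimizer, loss_fn, and data loader already exist:

import torch

scaler = torch.amp.GradScaler("cuda")          # replaces torch.cuda.amp.GradScaler(...)

for data, target in loader:                    # loader/model/optimizer/loss_fn are assumed to exist
    optimizer.zero_grad()
    with torch.amp.autocast("cuda", dtype=torch.float16):   # replaces torch.cuda.amp.autocast(...)
        output = model(data)
        loss = loss_fn(output, target)
    scaler.scale(loss).backward()              # backward runs outside the autocast region
    scaler.step(optimizer)
    scaler.update()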
diff --git a/2.9/autograd.html b/2.9/autograd.html
index 05fb3a8eeff..e23293911a9 100644
--- a/2.9/autograd.html
+++ b/2.9/autograd.html
@@ -4629,7 +4629,7 @@
Tensor autograd functions
Function#
Base class to create custom autograd.Function.
To create a custom autograd.Function, subclass this class and implement the
forward() and backward() static methods. Then, to use your custom
@@ -4772,7 +4772,7 @@ Profiler
emit_itt.
Context manager that manages autograd profiler state and holds a summary of results.
Note
@@ -4907,7 +4907,7 @@
Context manager that makes every autograd operation emit an NVTX range.
It is useful when running the program under nvprof:
nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
@@ -4976,7 +4976,7 @@ Profiler
-
-class torch.autograd.profiler.emit_itt(enabled=True, record_shapes=False)[source]#
+class torch.autograd.profiler.emit_itt(enabled=True, record_shapes=False)[source]#
Context manager that makes every autograd operation emit an ITT range.
It is useful when running the program under Intel(R) VTune Profiler:
vtune <--vtune-flags> <regular command here>
@@ -5027,7 +5027,7 @@ Profiler#
-
-class torch.autograd.detect_anomaly(check_nan=True)[source]#
+class torch.autograd.detect_anomaly(check_nan=True)[source]#
Context-manager that enables anomaly detection for the autograd engine.
This does two things:
@@ -5098,7 +5098,7 @@ Debugging and anomaly detection
-
-class torch.autograd.set_detect_anomaly(mode, check_nan=True)[source]#
+class torch.autograd.set_detect_anomaly(mode, check_nan=True)[source]#
Context-manager that sets the anomaly detection for the autograd engine on or off.
set_detect_anomaly will enable or disable the autograd anomaly detection
based on its argument mode.
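A short usage sketch of both forms (the tensor and the computation are illustrative only). With check_nan=True, a NaN produced during backward raises an error together with a traceback of the forward operation that created the failing backward function.

import torch

x = torch.randn(4, requires_grad=True)

# Context-manager form: anomaly checks apply only inside the block.
with torch.autograd.detect_anomaly():
    y = x.exp().sum()
    y.backward()

# Global toggle form, e.g. driven by a debug flag.
torch.autograd.set_detect_anomaly(True)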
@@ -5184,7 +5184,7 @@
Autograd graph
Hooks for saved tensors.
-
-class torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook)[source]#
+class torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook)[source]#
Context-manager that sets a pair of pack / unpack hooks for saved tensors.
Use this context-manager to define how intermediary results of an operation
should be packed before saving, and unpacked on retrieval.
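A minimal sketch of a pack/unpack pair; here the hooks only log, but the same structure is where compression or offloading would go:

import torch

def pack(tensor):
    print("packing", tuple(tensor.shape))
    return tensor            # could instead move to CPU, quantize, etc.

def unpack(packed):
    print("unpacking", tuple(packed.shape))
    return packed

a = torch.randn(5, requires_grad=True)
with torch.autograd.graph.saved_tensors_hooks(pack, unpack):
    y = (a * a).sum()        # a is saved for backward, so pack() runs here
y.backward()                 # unpack() runs when the saved tensor is retrieved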
@@ -5251,7 +5251,7 @@ Autograd graph
-
-class torch.autograd.graph.save_on_cpu(pin_memory=False, device_type='cuda')[source]#
+class torch.autograd.graph.save_on_cpu(pin_memory=False, device_type='cuda')[source]#
Context manager under which tensors saved by the forward pass will be stored on cpu, then retrieved for backward.
When performing operations within this context manager, intermediary
results saved in the graph during the forward pass will be moved to CPU,
@@ -5291,7 +5291,7 @@
Autograd graph
-
-class torch.autograd.graph.disable_saved_tensors_hooks(error_message)[source]#
+class torch.autograd.graph.disable_saved_tensors_hooks(error_message)[source]#
Context-manager that disables the saved tensors default hooks feature.
Useful for if you are creating a feature that does not work with saved
tensors default hooks.
@@ -5317,7 +5317,7 @@ Autograd graph
-
-class torch.autograd.graph.register_multi_grad_hook(tensors, fn, *, mode='all')[source]#
+class torch.autograd.graph.register_multi_grad_hook(tensors, fn, *, mode='all')[source]#
Register a multi-grad backward hook.
There are two supported modes: "all" and "any".
Under the "all" mode, the hook will be called after gradients with respect to every tensor in
@@ -5368,7 +5368,7 @@
Autograd graph
-
-class torch.autograd.graph.allow_mutation_on_saved_tensors[source]#
+class torch.autograd.graph.allow_mutation_on_saved_tensors[source]#
Context manager under which mutating tensors saved for backward is allowed.
Under this context manager, tensors saved for backward are cloned on mutation,
so the original version can still be used during backward. Normally, mutating a tensor
@@ -5404,7 +5404,7 @@
Autograd graph
-
-class torch.autograd.graph.GradientEdge(node, output_nr, ownership_token=None)[source]#
+class torch.autograd.graph.GradientEdge(node, output_nr, ownership_token=None)[source]#
Object representing a given gradient edge within the autograd graph.
To get the gradient edge where a given Tensor gradient will be computed,
you can do edge = autograd.graph.get_gradient_edge(tensor).
@@ -5414,7 +5414,7 @@ Autograd graph
-
-torch.autograd.graph.get_gradient_edge(tensor)[source]#
+torch.autograd.graph.get_gradient_edge(tensor)[source]#
Get the gradient edge for computing the gradient of the given Tensor.
In particular, it is equivalent to call
g = autograd.grad(loss, input) and g = autograd.grad(loss, get_gradient_edge(input)).
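A sketch of the equivalence stated above (tensor names are illustrative):

import torch
from torch.autograd.graph import get_gradient_edge

x = torch.randn(3, requires_grad=True)
loss = (x * 2).sum()

g_from_tensor, = torch.autograd.grad(loss, x, retain_graph=True)
g_from_edge, = torch.autograd.grad(loss, [get_gradient_edge(x)])
assert torch.equal(g_from_tensor, g_from_edge)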
diff --git a/2.9/backends.html b/2.9/backends.html
index 165e4a1d583..60b84c60e45 100644
--- a/2.9/backends.html
+++ b/2.9/backends.html
@@ -4414,7 +4414,7 @@
torch.backends.cpu#
-
-torch.backends.cpu.get_cpu_capability()[source]#
+torch.backends.cpu.get_cpu_capability()[source]#
Return cpu capability as a string value.
Possible values:
- “DEFAULT”
@@ -4436,7 +4436,7 @@
torch.backends.cuda#
-
-torch.backends.cuda.is_built()[source]#
+torch.backends.cuda.is_built()[source]#
Return whether PyTorch is built with CUDA support.
Note that this doesn’t necessarily mean CUDA is available; just that if this PyTorch
binary were run on a machine with working CUDA drivers and devices, we would be able to use it.
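The distinction matters in practice: is_built() reports build-time support, while torch.cuda.is_available() reports runtime usability. A small illustrative check:

import torch

if torch.backends.cuda.is_built() and torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("using", device)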
@@ -4488,7 +4488,7 @@
-
-torch.backends.cuda.preferred_blas_library(backend=None)[source]#
+torch.backends.cuda.preferred_blas_library(backend=None)[source]#
Override the library PyTorch uses for BLAS operations. Choose between cuBLAS, cuBLASLt, and CK [ROCm-only].
Warning
@@ -4521,7 +4521,7 @@
-
-torch.backends.cuda.preferred_rocm_fa_library(backend=None)[source]#
+torch.backends.cuda.preferred_rocm_fa_library(backend=None)[source]#
[ROCm-only]
Override the backend PyTorch uses in ROCm environments for Flash Attention. Choose between AOTriton and CK
@@ -4551,7 +4551,7 @@
-
-torch.backends.cuda.preferred_linalg_library(backend=None)[source]#
+torch.backends.cuda.preferred_linalg_library(backend=None)[source]#
Override the heuristic PyTorch uses to choose between cuSOLVER and MAGMA for CUDA linear algebra operations.
Warning
@@ -4606,7 +4606,7 @@
-
-torch.backends.cuda.flash_sdp_enabled()[source]#
+torch.backends.cuda.flash_sdp_enabled()[source]#
-
Warning
This flag is beta and subject to change.
@@ -4616,7 +4616,7 @@
-
-torch.backends.cuda.enable_mem_efficient_sdp(enabled)[source]#
+torch.backends.cuda.enable_mem_efficient_sdp(enabled)[source]#
-
Warning
This flag is beta and subject to change.
@@ -4628,7 +4628,7 @@
-
-torch.backends.cuda.mem_efficient_sdp_enabled()[source]#
+torch.backends.cuda.mem_efficient_sdp_enabled()[source]#
-
Warning
This flag is beta and subject to change.
@@ -4638,7 +4638,7 @@
-
-torch.backends.cuda.enable_flash_sdp(enabled)[source]#
+torch.backends.cuda.enable_flash_sdp(enabled)[source]#
-
Warning
This flag is beta and subject to change.
@@ -4650,7 +4650,7 @@
-
-torch.backends.cuda.math_sdp_enabled()[source]#
+torch.backends.cuda.math_sdp_enabled()[source]#
-
Warning
This flag is beta and subject to change.
@@ -4660,7 +4660,7 @@
-
-torch.backends.cuda.enable_math_sdp(enabled)[source]#
+torch.backends.cuda.enable_math_sdp(enabled)[source]#
-
Warning
This flag is beta and subject to change.
@@ -4672,7 +4672,7 @@
-
-torch.backends.cuda.fp16_bf16_reduction_math_sdp_allowed()[source]#
+torch.backends.cuda.fp16_bf16_reduction_math_sdp_allowed()[source]#
-
Warning
This flag is beta and subject to change.
@@ -4682,7 +4682,7 @@
-
-torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(enabled)[source]#
+torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(enabled)[source]#
-
Warning
This flag is beta and subject to change.
@@ -4694,7 +4694,7 @@
-
-torch.backends.cuda.cudnn_sdp_enabled()[source]#
+torch.backends.cuda.cudnn_sdp_enabled()[source]#
-
Warning
This flag is beta and subject to change.
@@ -4704,7 +4704,7 @@
-
-torch.backends.cuda.enable_cudnn_sdp(enabled)[source]#
+torch.backends.cuda.enable_cudnn_sdp(enabled)[source]#
-
Warning
This flag is beta and subject to change.
@@ -4716,7 +4716,7 @@
-
-torch.backends.cuda.is_flash_attention_available()[source]#
+torch.backends.cuda.is_flash_attention_available()[source]#
Check if PyTorch was built with FlashAttention for scaled_dot_product_attention.
- Returns
@@ -4735,7 +4735,7 @@
-
-torch.backends.cuda.can_use_flash_attention(params, debug=False)[source]#
+torch.backends.cuda.can_use_flash_attention(params, debug=False)[source]#
Check if FlashAttention can be utilized in scaled_dot_product_attention.
- Parameters
@@ -4763,7 +4763,7 @@
-
-torch.backends.cuda.can_use_efficient_attention(params, debug=False)[source]#
+torch.backends.cuda.can_use_efficient_attention(params, debug=False)[source]#
Check if efficient_attention can be utilized in scaled_dot_product_attention.
- Parameters
@@ -4791,7 +4791,7 @@
-
-torch.backends.cuda.can_use_cudnn_attention(params, debug=False)[source]#
+torch.backends.cuda.can_use_cudnn_attention(params, debug=False)[source]#
Check if cudnn_attention can be utilized in scaled_dot_product_attention.
- Parameters
@@ -4819,7 +4819,7 @@
-
-torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True, enable_cudnn=True)[source]#
+torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True, enable_cudnn=True)[source]#
-
Warning
This flag is beta and subject to change.
@@ -4835,13 +4835,13 @@
torch.backends.cudnn#
-
-torch.backends.cudnn.version()[source]#
+torch.backends.cudnn.version()[source]#
Return the version of cuDNN.
-
-torch.backends.cudnn.is_available()[source]#
+torch.backends.cudnn.is_available()[source]#
Return a bool indicating if CUDNN is currently available.
@@ -4887,7 +4887,7 @@
torch.backends.cusparselt#
-
-torch.backends.cusparselt.version()[source]#
+torch.backends.cusparselt.version()[source]#
Return the version of cuSPARSELt
- Return type
@@ -4898,7 +4898,7 @@
-
-torch.backends.cusparselt.is_available()[source]#
+torch.backends.cusparselt.is_available()[source]#
Return a bool indicating if cuSPARSELt is currently available.
- Return type
@@ -4912,7 +4912,7 @@
torch.backends.mha#
-
-torch.backends.mha.get_fastpath_enabled()[source]#
+torch.backends.mha.get_fastpath_enabled()[source]#
Returns whether fast path for TransformerEncoder and MultiHeadAttention
is enabled, or True if jit is scripting.
@@ -4929,7 +4929,7 @@
-
-torch.backends.mha.set_fastpath_enabled(value)[source]#
+torch.backends.mha.set_fastpath_enabled(value)[source]#
Sets whether fast path is enabled
@@ -4950,7 +4950,7 @@
torch.backends.mps#
-
-torch.backends.mps.is_available()[source]#
+torch.backends.mps.is_available()[source]#
Return a bool indicating if MPS is currently available.
- Return type
@@ -4961,7 +4961,7 @@
-
-torch.backends.mps.is_built()[source]#
+torch.backends.mps.is_built()[source]#
Return whether PyTorch is built with MPS support.
Note that this doesn’t necessarily mean MPS is available; just that
if this PyTorch binary were run on a machine with working MPS drivers
@@ -4978,13 +4978,13 @@
torch.backends.mkl#
-
-torch.backends.mkl.is_available()[source]#
+torch.backends.mkl.is_available()[source]#
Return whether PyTorch is built with MKL support.
-
-class torch.backends.mkl.verbose(enable)[source]#
+class torch.backends.mkl.verbose(enable)[source]#
On-demand oneMKL verbose logging functionality.
To make it easier to debug performance issues, oneMKL can dump verbose
messages containing execution information like duration while executing
@@ -5017,12 +5017,12 @@
torch.backends.mkldnn#
-
-class torch.backends.mkldnn.verbose(level)[source]#
+class torch.backends.mkldnn.verbose(level)[source]#
On-demand oneDNN (formerly MKL-DNN) verbose logging functionality.
To make it easier to debug performance issues, oneDNN can dump verbose
messages containing information like kernel size, input data size and
@@ -5056,19 +5056,19 @@
torch.backends.nnpack#
-
-torch.backends.nnpack.is_available()[source]#
+torch.backends.nnpack.is_available()[source]#
Return whether PyTorch is built with NNPACK support.
-
-torch.backends.nnpack.flags(enabled=False)[source]#
+torch.backends.nnpack.flags(enabled=False)[source]#
Context manager for setting whether NNPACK is enabled globally.
-
-torch.backends.nnpack.set_flags(_enabled)[source]#
+torch.backends.nnpack.set_flags(_enabled)[source]#
Set whether NNPACK is enabled globally.
@@ -5077,7 +5077,7 @@
torch.backends.openmp#
-
-torch.backends.openmp.is_available()[source]#
+torch.backends.openmp.is_available()[source]#
Return whether PyTorch is built with OpenMP support.
@@ -5086,7 +5086,7 @@
torch.backends.opt_einsum#
-
-torch.backends.opt_einsum.is_available()[source]#
+torch.backends.opt_einsum.is_available()[source]#
Return a bool indicating if opt_einsum is currently available.
You must install opt-einsum in order for torch to automatically optimize einsum. To
make opt-einsum available, you can install it along with torch: pip install torch[opt-einsum]
@@ -5102,7 +5102,7 @@
-
-torch.backends.opt_einsum.get_opt_einsum()[source]#
+torch.backends.opt_einsum.get_opt_einsum()[source]#
Return the opt_einsum package if opt_einsum is currently available, else None.
- Return type
diff --git a/2.9/benchmark_utils.html b/2.9/benchmark_utils.html
index 7be1a72b03d..ff49ed1613b 100644
--- a/2.9/benchmark_utils.html
+++ b/2.9/benchmark_utils.html
@@ -4396,7 +4396,7 @@
Created On: Nov 02, 2020 | Last Updated On: Jun 12, 2025
-
-class torch.utils.benchmark.Timer(stmt='pass', setup='pass', global_setup='', timer=<built-in function perf_counter>, globals=None, label=None, sub_label=None, description=None, env=None, num_threads=1, language=Language.PYTHON)[source]#
+class torch.utils.benchmark.Timer(stmt='pass', setup='pass', global_setup='', timer=<built-in function perf_counter>, globals=None, label=None, sub_label=None, description=None, env=None, num_threads=1, language=Language.PYTHON)[source]#
Helper class for measuring execution time of PyTorch statements.
For a full tutorial on how to use this class, see:
https://pytorch.org/tutorials/recipes/recipes/benchmark.html
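A minimal sketch of the Timer API described above; the matrix size and labels are arbitrary:

import torch
from torch.utils.benchmark import Timer

x = torch.randn(1024, 1024)
t = Timer(
    stmt="x @ x",
    globals={"x": x},
    label="matmul",
    sub_label="1024x1024",
    num_threads=1,
)
print(t.blocked_autorange(min_run_time=0.2))   # prints a Measurement summary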
@@ -4500,7 +4500,7 @@
-
-adaptive_autorange(threshold=0.1, *, min_run_time=0.01, max_run_time=10.0, callback=None)[source]#
+adaptive_autorange(threshold=0.1, *, min_run_time=0.01, max_run_time=10.0, callback=None)[source]#
Similar to blocked_autorange but also checks for variability in measurements
and repeats until iqr/median is smaller than threshold or max_run_time is reached.
At a high level, adaptive_autorange executes the following pseudo-code:
@@ -4541,7 +4541,7 @@
-
-blocked_autorange(callback=None, min_run_time=0.2)[source]#
+blocked_autorange(callback=None, min_run_time=0.2)[source]#
Measure many replicates while keeping timer overhead to a minimum.
At a high level, blocked_autorange executes the following pseudo-code:
`setup`
@@ -4586,7 +4586,7 @@
-
-collect_callgrind(number: int, *, repeats: None, collect_baseline: bool, retain_out_file: bool) CallgrindStats[source]#
+collect_callgrind(number: int, *, repeats: None, collect_baseline: bool, retain_out_file: bool) CallgrindStats[source]#
-
collect_callgrind(number: int, *, repeats: int, collect_baseline: bool, retain_out_file: bool) tuple[torch.utils.benchmark.utils.valgrind_wrapper.timer_interface.CallgrindStats, ...]
Collect instruction counts using Callgrind.
@@ -4621,7 +4621,7 @@
-
-timeit(number=1000000)[source]#
+timeit(number=1000000)[source]#
Mirrors the semantics of timeit.Timer.timeit().
Execute the main statement (stmt) number times.
https://docs.python.org/3/library/timeit.html#timeit.Timer.timeit
@@ -4636,7 +4636,7 @@
-
-class torch.utils.benchmark.Measurement(number_per_run, raw_times, task_spec, metadata=None)[source]#
+class torch.utils.benchmark.Measurement(number_per_run, raw_times, task_spec, metadata=None)[source]#
The result of a Timer measurement.
This class stores one or more measurements of a given statement. It is
serializable and provides several convenience methods
@@ -4645,7 +4645,7 @@
-
-static merge(measurements)[source]#
+static merge(measurements)[source]#
Convenience method for merging replicates.
Merge will extrapolate times to number_per_run=1 and will not
transfer any metadata. (Since it might differ between replicates)
@@ -4675,7 +4675,7 @@
-
-class torch.utils.benchmark.CallgrindStats(task_spec, number_per_run, built_with_debug_symbols, baseline_inclusive_stats, baseline_exclusive_stats, stmt_inclusive_stats, stmt_exclusive_stats, stmt_callgrind_out)[source]#
+class torch.utils.benchmark.CallgrindStats(task_spec, number_per_run, built_with_debug_symbols, baseline_inclusive_stats, baseline_exclusive_stats, stmt_inclusive_stats, stmt_exclusive_stats, stmt_callgrind_out)[source]#
Top level container for Callgrind results collected by Timer.
Manipulation is generally done using the FunctionCounts class, which is
obtained by calling CallgrindStats.stats(…). Several convenience
@@ -4685,7 +4685,7 @@
-
-as_standardized()[source]#
+as_standardized()[source]#
Strip library names and some prefixes from function strings.
When comparing two different sets of instruction counts, one stumbling
block can be path prefixes. Callgrind includes the full filepath
@@ -4714,7 +4714,7 @@
-
-counts(*, denoise=False)[source]#
+counts(*, denoise=False)[source]#
Returns the total number of instructions executed.
See FunctionCounts.denoise() for an explanation of the denoise arg.
@@ -4726,7 +4726,7 @@
-
-delta(other, inclusive=False)[source]#
+delta(other, inclusive=False)[source]#
Diff two sets of counts.
One common reason to collect instruction counts is to determine the
effect that a particular change will have on the number of instructions
@@ -4744,7 +4744,7 @@
-
-stats(inclusive=False)[source]#
+stats(inclusive=False)[source]#
Returns detailed function counts.
Conceptually, the FunctionCounts returned can be thought of as a tuple
of (count, path_and_function_name) tuples.
@@ -4764,7 +4764,7 @@
-
-class torch.utils.benchmark.FunctionCounts(_data, inclusive, truncate_rows=True, _linewidth=None)[source]#
+class torch.utils.benchmark.FunctionCounts(_data, inclusive, truncate_rows=True, _linewidth=None)[source]#
Container for manipulating Callgrind results.
- It supports:
@@ -4781,7 +4781,7 @@
-
-denoise()[source]#
+denoise()[source]#
Remove known noisy instructions.
Several instructions in the CPython interpreter are rather noisy. These
instructions involve unicode to dictionary lookups which Python uses to
@@ -4797,7 +4797,7 @@
-
-filter(filter_fn)[source]#
+filter(filter_fn)[source]#
Keep only the elements where filter_fn applied to function name returns True.
- Return type
@@ -4808,7 +4808,7 @@
-
-transform(map_fn)[source]#
+transform(map_fn)[source]#
Apply map_fn to all of the function names.
This can be used to regularize function names (e.g. stripping irrelevant
parts of the file path), coalesce entries by mapping multiple functions
@@ -4824,7 +4824,7 @@
-
-class torch.utils.benchmark.Compare(results)[source]#
+class torch.utils.benchmark.Compare(results)[source]#
Helper class for displaying the results of many measurements in a
formatted table.
The table format is based on the information fields provided in
@@ -4840,33 +4840,33 @@
-
-colorize(rowwise=False)[source]#
+colorize(rowwise=False)[source]#
Colorize formatted table.
Colorize columnwise by default.
-
-extend_results(results)[source]#
+extend_results(results)[source]#
Append results to already stored ones.
All added results must be instances of Measurement.
-
-highlight_warnings()[source]#
+highlight_warnings()[source]#
Enables warning highlighting when building the formatted table.
-
-trim_significant_figures()[source]#
+trim_significant_figures()[source]#
Enables trimming of significant figures when building the formatted table.
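A sketch combining the helpers above: collect Measurements for a couple of problem sizes, then format them in one table (sizes and labels are arbitrary):

import torch
from torch.utils.benchmark import Timer, Compare

results = []
for n in (256, 1024):
    x = torch.randn(n, n)
    results.append(
        Timer(stmt="x @ x", globals={"x": x},
              label="matmul", sub_label=f"{n}x{n}").blocked_autorange(min_run_time=0.1)
    )

compare = Compare(results)
compare.trim_significant_figures()
compare.colorize(rowwise=True)
compare.highlight_warnings()
compare.print()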
diff --git a/2.9/checkpoint.html b/2.9/checkpoint.html
index fb5536ae459..8fe2d746f6c 100644
--- a/2.9/checkpoint.html
+++ b/2.9/checkpoint.html
@@ -4426,7 +4426,7 @@ torch.utils.checkpoint
-
-torch.utils.checkpoint.checkpoint(function, *args, use_reentrant=None, context_fn=<function noop_context_fn>, determinism_check='default', debug=False, early_stop=True, **kwargs)[source]#
+torch.utils.checkpoint.checkpoint(function, *args, use_reentrant=None, context_fn=<function noop_context_fn>, determinism_check='default', debug=False, early_stop=True, **kwargs)[source]#
Checkpoint a model or part of the model.
Activation checkpointing is a technique that trades compute for memory.
Instead of keeping tensors needed for backward alive until they are used in
@@ -4547,7 +4547,7 @@
torch.utils.checkpoint
-
-torch.utils.checkpoint.checkpoint_sequential(functions, segments, input, use_reentrant=None, **kwargs)[source]#
+torch.utils.checkpoint.checkpoint_sequential(functions, segments, input, use_reentrant=None, **kwargs)[source]#
Checkpoint a sequential model to save memory.
Sequential models execute a list of modules/functions in order
(sequentially). Therefore, we can divide such a model in various segments
@@ -4597,7 +4597,7 @@
torch.utils.checkpoint
-
-torch.utils.checkpoint.set_checkpoint_debug_enabled(enabled)[source]#
+torch.utils.checkpoint.set_checkpoint_debug_enabled(enabled)[source]#
Context manager that sets whether checkpoint should print additional debug
information when running. See the debug flag for
checkpoint() for more information. Note that
@@ -4613,7 +4613,7 @@
torch.utils.checkpoint
-
-class torch.utils.checkpoint.CheckpointPolicy(value)[source]#
+class torch.utils.checkpoint.CheckpointPolicy(value)[source]#
Enum for specifying the policy for checkpointing during backpropagation.
The following policies are supported:
@@ -4637,7 +4637,7 @@ torch.utils.checkpoint
-
-class torch.utils.checkpoint.SelectiveCheckpointContext(*, is_recompute)[source]#
+class torch.utils.checkpoint.SelectiveCheckpointContext(*, is_recompute)[source]#
Context passed to policy function during selective checkpointing.
This class is used to pass relevant metadata to the policy function during
selective checkpointing. The metadata includes whether the current invocation
@@ -4660,7 +4660,7 @@
torch.utils.checkpoint
-
-torch.utils.checkpoint.create_selective_checkpoint_contexts(policy_fn_or_list, allow_cache_entry_mutation=False)[source]#
+torch.utils.checkpoint.create_selective_checkpoint_contexts(policy_fn_or_list, allow_cache_entry_mutation=False)[source]#
Helper to avoid recomputing certain ops during activation checkpointing.
Use this with torch.utils.checkpoint.checkpoint to control which
operations are recomputed during the backward pass.
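A sketch of the documented pattern, passing a partially-applied context factory to checkpoint(); the op list and the checkpointed function here are assumptions chosen for illustration:

import functools
import torch
from torch.utils.checkpoint import checkpoint, create_selective_checkpoint_contexts

ops_to_save = [torch.ops.aten.mm.default]      # always save matmul results instead of recomputing
context_fn = functools.partial(create_selective_checkpoint_contexts, ops_to_save)

def fn(x):
    return torch.sigmoid(torch.mm(x, x)).sum()

x = torch.randn(8, 8, requires_grad=True)
out = checkpoint(fn, x, use_reentrant=False, context_fn=context_fn)
out.backward()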
diff --git a/2.9/cond.html b/2.9/cond.html
index bc6ea654c39..32f0d5759ea 100644
--- a/2.9/cond.html
+++ b/2.9/cond.html
@@ -4564,7 +4564,7 @@ Invariants of torch.ops.higher_order.cond#
-
-torch._higher_order_ops.cond.cond(pred, true_fn, false_fn, operands=())[source]#
+torch._higher_order_ops.cond.cond(pred, true_fn, false_fn, operands=())[source]#
Conditionally applies true_fn or false_fn.
Warning
diff --git a/2.9/config_mod.html b/2.9/config_mod.html
index 57eae9ff84e..cd6cbd0e499 100644
--- a/2.9/config_mod.html
+++ b/2.9/config_mod.html
@@ -4396,7 +4396,7 @@
Created On: Apr 09, 2019 | Last Updated On: Jun 13, 2025
-
-torch.__config__.show()[source]#
+torch.__config__.show()[source]#
Return a human-readable string with descriptions of the
configuration of PyTorch.
@@ -4408,7 +4408,7 @@
-
-torch.__config__.parallel_info()[source]#
+torch.__config__.parallel_info()[source]#
Returns detailed string with parallelization settings
- Return type
diff --git a/2.9/cpp_extension.html b/2.9/cpp_extension.html
index 044271d1e08..38b719bb8e1 100644
--- a/2.9/cpp_extension.html
+++ b/2.9/cpp_extension.html
@@ -4396,7 +4396,7 @@ torch.utils.cpp_extension
Created On: Mar 07, 2018 | Last Updated On: Feb 16, 2025
-
-torch.utils.cpp_extension.CppExtension(name, sources, *args, **kwargs)[source]#
+torch.utils.cpp_extension.CppExtension(name, sources, *args, **kwargs)[source]#
Create a setuptools.Extension for C++.
Convenience method that creates a setuptools.Extension with the
bare minimum (but often sufficient) arguments to build a C++ extension.
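A sketch of the typical setup.py wiring for CppExtension together with the BuildExtension command described further below; the package and source file names are hypothetical:

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CppExtension

setup(
    name="my_extension",                                   # hypothetical package name
    ext_modules=[CppExtension("my_extension", ["extension.cpp"])],
    cmdclass={"build_ext": BuildExtension},
)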
@@ -4441,7 +4441,7 @@ torch.utils.cpp_extension
-
-torch.utils.cpp_extension.CUDAExtension(name, sources, *args, **kwargs)[source]#
+torch.utils.cpp_extension.CUDAExtension(name, sources, *args, **kwargs)[source]#
Create a setuptools.Extension for CUDA/C++.
Convenience method that creates a setuptools.Extension with the
bare minimum (but often sufficient) arguments to build a CUDA/C++
@@ -4546,7 +4546,7 @@
torch.utils.cpp_extension
-
-torch.utils.cpp_extension.SyclExtension(name, sources, *args, **kwargs)[source]#
+torch.utils.cpp_extension.SyclExtension(name, sources, *args, **kwargs)[source]#
Creates a setuptools.Extension for SYCL/C++.
Convenience method that creates a setuptools.Extension with the
bare minimum (but often sufficient) arguments to build a SYCL/C++
@@ -4598,7 +4598,7 @@
torch.utils.cpp_extension
-
-torch.utils.cpp_extension.BuildExtension(*args, **kwargs)[source]#
+torch.utils.cpp_extension.BuildExtension(*args, **kwargs)[source]#
A custom setuptools build extension.
This setuptools.build_ext subclass takes care of passing the
minimum required compiler flags (e.g. -std=c++17) as well as mixed
@@ -4626,7 +4626,7 @@
torch.utils.cpp_extension
-
-torch.utils.cpp_extension.load(name, sources, extra_cflags=None, extra_cuda_cflags=None, extra_sycl_cflags=None, extra_ldflags=None, extra_include_paths=None, build_directory=None, verbose=False, with_cuda=None, with_sycl=None, is_python_module=True, is_standalone=False, keep_intermediates=True)[source]#
+torch.utils.cpp_extension.load(name, sources, extra_cflags=None, extra_cuda_cflags=None, extra_sycl_cflags=None, extra_ldflags=None, extra_include_paths=None, build_directory=None, verbose=False, with_cuda=None, with_sycl=None, is_python_module=True, is_standalone=False, keep_intermediates=True)[source]#
Load a PyTorch C++ extension just-in-time (JIT).
To load an extension, a Ninja build file is emitted, which is used to
compile the given sources into a dynamic library. This library is
@@ -4726,7 +4726,7 @@
torch.utils.cpp_extension
-
-torch.utils.cpp_extension.load_inline(name, cpp_sources, cuda_sources=None, sycl_sources=None, functions=None, extra_cflags=None, extra_cuda_cflags=None, extra_sycl_cflags=None, extra_ldflags=None, extra_include_paths=None, build_directory=None, verbose=False, with_cuda=None, with_sycl=None, is_python_module=True, with_pytorch_error_handling=True, keep_intermediates=True, use_pch=False, no_implicit_headers=False)[source]#
+torch.utils.cpp_extension.load_inline(name, cpp_sources, cuda_sources=None, sycl_sources=None, functions=None, extra_cflags=None, extra_cuda_cflags=None, extra_sycl_cflags=None, extra_ldflags=None, extra_include_paths=None, build_directory=None, verbose=False, with_cuda=None, with_sycl=None, is_python_module=True, with_pytorch_error_handling=True, keep_intermediates=True, use_pch=False, no_implicit_headers=False)[source]#
Load a PyTorch C++ extension just-in-time (JIT) from string sources.
This function behaves exactly like load(), but takes its sources as
strings rather than filenames. These strings are stored to files in the
@@ -4823,7 +4823,7 @@
torch.utils.cpp_extension
-
-torch.utils.cpp_extension.include_paths(device_type='cpu')[source]#
+torch.utils.cpp_extension.include_paths(device_type='cpu')[source]#
Get the include paths required to build a C++ or CUDA or SYCL extension.
- Parameters
@@ -4840,7 +4840,7 @@ torch.utils.cpp_extension
-
-torch.utils.cpp_extension.get_compiler_abi_compatibility_and_version(compiler)[source]#
+torch.utils.cpp_extension.get_compiler_abi_compatibility_and_version(compiler)[source]#
Determine if the given compiler is ABI-compatible with PyTorch alongside its version.
- Parameters
@@ -4859,13 +4859,13 @@ torch.utils.cpp_extension
-
-torch.utils.cpp_extension.verify_ninja_availability()[source]#
+torch.utils.cpp_extension.verify_ninja_availability()[source]#
Raise RuntimeError if ninja build system is not available on the system, does nothing otherwise.
-
-torch.utils.cpp_extension.is_ninja_available()[source]#
+torch.utils.cpp_extension.is_ninja_available()[source]#
Return True if the ninja build system is available on the system, False otherwise.
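For the JIT path described by load_inline() above, a minimal sketch (the module name and function are hypothetical; building requires a working compiler and, as noted, Ninja):

import torch
from torch.utils.cpp_extension import load_inline

cpp_source = "torch::Tensor add_one(torch::Tensor x) { return x + 1; }"

ext = load_inline(
    name="add_one_ext",          # hypothetical module name
    cpp_sources=cpp_source,
    functions=["add_one"],       # auto-generates the Python binding
    verbose=True,
)
print(ext.add_one(torch.zeros(3)))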
diff --git a/2.9/cuda._sanitizer.html b/2.9/cuda._sanitizer.html
index f2228bbbb4b..a5e19903027 100644
--- a/2.9/cuda._sanitizer.html
+++ b/2.9/cuda._sanitizer.html
@@ -4505,7 +4505,7 @@ Usage#
API Reference#
-
-torch.cuda._sanitizer.enable_cuda_sanitizer()[source]#
+torch.cuda._sanitizer.enable_cuda_sanitizer()[source]#
Enable CUDA Sanitizer.
The sanitizer will begin to analyze low-level CUDA calls invoked by torch functions
for synchronization errors. All data races found will be printed to the standard
diff --git a/2.9/cuda.html b/2.9/cuda.html
index c2f4e111b33..fabd59f6df2 100644
--- a/2.9/cuda.html
+++ b/2.9/cuda.html
@@ -4744,7 +4744,7 @@
Memory management
-
-class torch.cuda.use_mem_pool(pool, device=None)[source]#
+class torch.cuda.use_mem_pool(pool, device=None)[source]#
A context manager that routes allocations to a given pool.
- Parameters
diff --git a/2.9/cuda.tunable.html b/2.9/cuda.tunable.html
index 1827dd066d2..74534480b08 100644
--- a/2.9/cuda.tunable.html
+++ b/2.9/cuda.tunable.html
@@ -4570,7 +4570,7 @@ Environment Variable Interface#
-
-torch.cuda.tunable.enable(val=True)[source]#
+torch.cuda.tunable.enable(val=True)[source]#
This is the big on/off switch for all TunableOp implementations.
@@ -4578,7 +4578,7 @@ API Reference
-
-torch.cuda.tunable.is_enabled()[source]#
+torch.cuda.tunable.is_enabled()[source]#
Returns whether the TunableOp feature is enabled.
- Return type
@@ -4589,7 +4589,7 @@ API Reference
-
-torch.cuda.tunable.tuning_enable(val=True)[source]#
+torch.cuda.tunable.tuning_enable(val=True)[source]#
Enable tuning of TunableOp implementations.
When enabled, if a tuned entry isn’t found, run the tuning step and record
the entry.
@@ -4599,7 +4599,7 @@ API Reference
-
-torch.cuda.tunable.tuning_is_enabled()[source]#
+torch.cuda.tunable.tuning_is_enabled()[source]#
Returns whether TunableOp implementations can be tuned.
- Return type
@@ -4610,7 +4610,7 @@ API Reference
-
-torch.cuda.tunable.record_untuned_enable(val=True)[source]#
+torch.cuda.tunable.record_untuned_enable(val=True)[source]#
Enable recording of untuned TunableOp operations for offline tuning.
When enabled, if a tuned entry isn’t found, write it to the untuned file.
@@ -4619,7 +4619,7 @@ API Reference
-
-torch.cuda.tunable.record_untuned_is_enabled()[source]#
+torch.cuda.tunable.record_untuned_is_enabled()[source]#
Returns whether TunableOp operations are recorded for offline tuning.
- Return type
@@ -4630,7 +4630,7 @@ API Reference
-
-torch.cuda.tunable.set_max_tuning_duration(duration)[source]#
+torch.cuda.tunable.set_max_tuning_duration(duration)[source]#
Set max time in milliseconds to spend tuning a given solution.
If both max tuning duration and iterations are set, the smaller of the two
will be honored. At minimum 1 tuning iteration will always be run.
@@ -4640,7 +4640,7 @@ API Reference
-
-torch.cuda.tunable.get_max_tuning_duration()[source]#
+torch.cuda.tunable.get_max_tuning_duration()[source]#
Get max time to spend tuning a given solution.
- Return type
@@ -4651,7 +4651,7 @@ API Reference
-
-torch.cuda.tunable.set_max_tuning_iterations(iterations)[source]#
+torch.cuda.tunable.set_max_tuning_iterations(iterations)[source]#
Set max number of iterations to spend tuning a given solution.
If both max tuning duration and iterations are set, the smaller of the two
will be honored. At minimum 1 tuning iteration will always be run.
@@ -4661,7 +4661,7 @@ API Reference
-
-torch.cuda.tunable.get_max_tuning_iterations()[source]#
+torch.cuda.tunable.get_max_tuning_iterations()[source]#
Get max iterations to spend tuning a given solution.
- Return type
@@ -4672,7 +4672,7 @@ API Reference
-
-torch.cuda.tunable.set_filename(filename, insert_device_ordinal=False)[source]#
+torch.cuda.tunable.set_filename(filename, insert_device_ordinal=False)[source]#
Set the filename to use for input/output of tuning results.
If insert_device_ordinal is True then the current device ordinal
will be added to the given filename automatically. This can be used in a
@@ -4683,7 +4683,7 @@
API Reference
-
-torch.cuda.tunable.get_filename()[source]#
+torch.cuda.tunable.get_filename()[source]#
Get the results filename.
- Return type
@@ -4694,7 +4694,7 @@ API Reference
-
-torch.cuda.tunable.get_results()[source]#
+torch.cuda.tunable.get_results()[source]#
Return all TunableOp results.
- Return type
@@ -4705,7 +4705,7 @@ API Reference
-
-torch.cuda.tunable.get_validators()[source]#
+torch.cuda.tunable.get_validators()[source]#
Return the TunableOp validators.
- Return type
@@ -4716,7 +4716,7 @@ API Reference
-
-torch.cuda.tunable.write_file_on_exit(val)[source]#
+torch.cuda.tunable.write_file_on_exit(val)[source]#
During Tuning Context destruction, write file to disk.
This is useful as a final flush of your results to disk if your application
terminates as a result of normal operation or an error. Manual flushing of
@@ -4727,7 +4727,7 @@
API Reference
-
-torch.cuda.tunable.write_file(filename=None)[source]#
+torch.cuda.tunable.write_file(filename=None)[source]#
Write results to a CSV file.
If filename is not given, get_filename() is called.
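A sketch tying several of the knobs above together for online tuning; the file name and limits are arbitrary, and a CUDA (or ROCm) device is assumed:

import torch

torch.cuda.tunable.enable(True)                    # master switch
torch.cuda.tunable.tuning_enable(True)             # allow new entries to be tuned
torch.cuda.tunable.set_max_tuning_iterations(30)   # smaller of iterations/duration wins
torch.cuda.tunable.set_max_tuning_duration(100)    # milliseconds per solution
torch.cuda.tunable.set_filename("tunableop_results.csv", insert_device_ordinal=True)

a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")
c = a @ b                                          # first occurrence of this GEMM shape triggers tuning
torch.cuda.tunable.write_file()                    # explicit flush of results to disk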
@@ -4739,7 +4739,7 @@ API Reference
-
-torch.cuda.tunable.read_file(filename=None)[source]#
+torch.cuda.tunable.read_file(filename=None)[source]#
Read results from a TunableOp CSV file.
If filename is not given, get_filename() is called.
@@ -4751,7 +4751,7 @@ API Reference
-
-torch.cuda.tunable.tune_gemm_in_file(filename)[source]#
+torch.cuda.tunable.tune_gemm_in_file(filename)[source]#
Tune GEMM operations in the given file.
@@ -4759,7 +4759,7 @@ API Reference
-
-torch.cuda.tunable.mgpu_tune_gemm_in_file(filename_pattern, num_gpus)[source]#
+torch.cuda.tunable.mgpu_tune_gemm_in_file(filename_pattern, num_gpus)[source]#
Process one or more files and distribute work over one or more GPUs.
@@ -4767,7 +4767,7 @@ API Reference
-
-torch.cuda.tunable.set_rotating_buffer_size(buffer_size)[source]#
+torch.cuda.tunable.set_rotating_buffer_size(buffer_size)[source]#
Set the rotating buffer size to this value in MB if the value is greater than zero.
If less than zero, the L2 cache size is queried and used. If equal to zero, the rotating buffer is deactivated.
@@ -4776,7 +4776,7 @@ API Reference
-
-torch.cuda.tunable.get_rotating_buffer_size()[source]#
+torch.cuda.tunable.get_rotating_buffer_size()[source]#
Get the rotating buffer size in kilobytes.
- Return type
diff --git a/2.9/data.html b/2.9/data.html
index 3e118bb3e52..1aa79155692 100644
--- a/2.9/data.html
+++ b/2.9/data.html
@@ -4762,7 +4762,7 @@
-
-class torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=None, sampler=None, batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None, multiprocessing_context=None, generator=None, *, prefetch_factor=None, persistent_workers=False, pin_memory_device='', in_order=True)[source]#
+class torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=None, sampler=None, batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None, multiprocessing_context=None, generator=None, *, prefetch_factor=None, persistent_workers=False, pin_memory_device='', in_order=True)[source]#
Data loader combines a dataset and a sampler, and provides an iterable over the given dataset.
The DataLoader supports both map-style and
iterable-style datasets with single- or multi-process loading, customizing
@@ -4864,7 +4864,7 @@
-
-class torch.utils.data.Dataset[source]#
+class torch.utils.data.Dataset[source]#
An abstract class representing a Dataset.
All datasets that represent a map from keys to data samples should subclass
it. All subclasses should overwrite __getitem__(), supporting fetching a
@@ -4885,7 +4885,7 @@
-
-class torch.utils.data.IterableDataset[source]#
+class torch.utils.data.IterableDataset[source]#
An iterable Dataset.
All datasets that represent an iterable of data samples should subclass it.
This form of dataset is particularly useful when data come from a stream.
@@ -4987,7 +4987,7 @@
-
-class torch.utils.data.TensorDataset(*tensors)[source]#
+class torch.utils.data.TensorDataset(*tensors)[source]#
Dataset wrapping tensors.
Each sample will be retrieved by indexing tensors along the first dimension.
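A small sketch pairing TensorDataset with the DataLoader described above; shapes are arbitrary:

import torch
from torch.utils.data import DataLoader, TensorDataset

features = torch.randn(100, 8)
labels = torch.randint(0, 2, (100,))
dataset = TensorDataset(features, labels)          # dataset[i] == (features[i], labels[i])
loader = DataLoader(dataset, batch_size=16, shuffle=True)

for batch_features, batch_labels in loader:
    pass                                           # a training step would go here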
@@ -4999,7 +4999,7 @@
-
-class torch.utils.data.StackDataset(*args, **kwargs)[source]#
+class torch.utils.data.StackDataset(*args, **kwargs)[source]#
Dataset as a stacking of multiple datasets.
This class is useful to assemble different parts of complex input data, given as datasets.
Example
@@ -5023,7 +5023,7 @@
-
-class torch.utils.data.ConcatDataset(datasets)[source]#
+class torch.utils.data.ConcatDataset(datasets)[source]#
Dataset as a concatenation of multiple datasets.
This class is useful to assemble different existing datasets.
@@ -5035,7 +5035,7 @@
-
-class torch.utils.data.ChainDataset(datasets)[source]#
+class torch.utils.data.ChainDataset(datasets)[source]#
Dataset for chaining multiple IterableDataset s.
This class is useful to assemble different existing dataset streams. The
chaining operation is done on-the-fly, so concatenating large-scale
@@ -5049,7 +5049,7 @@
-
-class torch.utils.data.Subset(dataset, indices)[source]#
+class torch.utils.data.Subset(dataset, indices)[source]#
Subset of a dataset at specified indices.
- Parameters
@@ -5063,7 +5063,7 @@
-
-torch.utils.data._utils.collate.collate(batch, *, collate_fn_map=None)[source]#
+torch.utils.data._utils.collate.collate(batch, *, collate_fn_map=None)[source]#
General collate function that handles collection type of element within each batch.
The function also opens function registry to deal with specific element types. default_collate_fn_map
provides default collate functions for tensors, numpy arrays, numbers and strings.
@@ -5098,7 +5098,7 @@
-
-torch.utils.data.default_collate(batch)[source]#
+torch.utils.data.default_collate(batch)[source]#
Take in a batch of data and put the elements within the batch into a tensor with an additional outer dimension - batch size.
The exact output type can be a torch.Tensor, a Sequence of torch.Tensor, a
Collection of torch.Tensor, or left unchanged, depending on the input type.
@@ -5164,7 +5164,7 @@
-
-torch.utils.data.default_convert(data)[source]#
+torch.utils.data.default_convert(data)[source]#
Convert each NumPy array element into a torch.Tensor.
If the input is a Sequence, Collection, or Mapping, it tries to convert each element inside to a torch.Tensor.
If the input is not a NumPy array, it is left unchanged.
@@ -5199,7 +5199,7 @@
-
-torch.utils.data.get_worker_info()[source]#
+torch.utils.data.get_worker_info()[source]#
Returns the information about the current
DataLoader iterator worker process.
When called in a worker, this returns an object guaranteed to have the
@@ -5233,7 +5233,7 @@
-
-torch.utils.data.random_split(dataset, lengths, generator=<torch._C.Generator object>)[source]#
+torch.utils.data.random_split(dataset, lengths, generator=<torch._C.Generator object>)[source]#
Randomly split a dataset into non-overlapping new datasets of given lengths.
If a list of fractions that sum up to 1 is given,
the lengths will be computed automatically as
@@ -5265,7 +5265,7 @@
-
-class torch.utils.data.Sampler(data_source=None)[source]#
+class torch.utils.data.Sampler(data_source=None)[source]#
Base class for all Samplers.
Every Sampler subclass has to provide an __iter__() method, providing a
way to iterate over indices or lists of indices (batches) of dataset elements,
@@ -5312,7 +5312,7 @@
-
-class torch.utils.data.SequentialSampler(data_source)[source]#
+class torch.utils.data.SequentialSampler(data_source)[source]#
Samples elements sequentially, always in the same order.
- Parameters
@@ -5323,7 +5323,7 @@
-
-class torch.utils.data.RandomSampler(data_source, replacement=False, num_samples=None, generator=None)[source]#
+class torch.utils.data.RandomSampler(data_source, replacement=False, num_samples=None, generator=None)[source]#
Samples elements randomly. If without replacement, then sample from a shuffled dataset.
If with replacement, then user can specify num_samples to draw.
@@ -5340,7 +5340,7 @@
-
-class torch.utils.data.SubsetRandomSampler(indices, generator=None)[source]#
+class torch.utils.data.SubsetRandomSampler(indices, generator=None)[source]#
Samples elements randomly from a given list of indices, without replacement.
- Parameters
@@ -5354,7 +5354,7 @@
-
-class torch.utils.data.WeightedRandomSampler(weights, num_samples, replacement=True, generator=None)[source]#
+class torch.utils.data.WeightedRandomSampler(weights, num_samples, replacement=True, generator=None)[source]#
Samples elements from [0,..,len(weights)-1] with given probabilities (weights).
- Parameters
@@ -5387,7 +5387,7 @@
-
-class torch.utils.data.BatchSampler(sampler, batch_size, drop_last)[source]#
+class torch.utils.data.BatchSampler(sampler, batch_size, drop_last)[source]#
Wraps another sampler to yield a mini-batch of indices.
- Parameters
@@ -5416,7 +5416,7 @@
-
-class torch.utils.data.distributed.DistributedSampler(dataset, num_replicas=None, rank=None, shuffle=True, seed=0, drop_last=False)[source]#
+class torch.utils.data.distributed.DistributedSampler(dataset, num_replicas=None, rank=None, shuffle=True, seed=0, drop_last=False)[source]#
Sampler that restricts data loading to a subset of the dataset.
It is especially useful in conjunction with
torch.nn.parallel.DistributedDataParallel. In such a case, each
diff --git a/2.9/ddp_comm_hooks.html b/2.9/ddp_comm_hooks.html
index fe89b45b874..b2c5645d833 100644
--- a/2.9/ddp_comm_hooks.html
+++ b/2.9/ddp_comm_hooks.html
@@ -4496,7 +4496,7 @@
Default Communication Hooks
bucket is a torch.distributed.GradBucket object.
-
-torch.distributed.algorithms.ddp_comm_hooks.default_hooks.allreduce_hook(process_group, bucket)[source]#
+torch.distributed.algorithms.ddp_comm_hooks.default_hooks.allreduce_hook(process_group, bucket)[source]#
Call allreduce using GradBucket tensors.
Once gradient tensors are aggregated across all workers, the callback
then takes the mean and returns the result.
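Hooks in this module are registered on a DDP-wrapped model; a sketch using the fp16 compression hook described just below (the allreduce hook above is registered the same way), assuming the process group is already initialized:

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks

# Assumes dist.init_process_group(...) has already been called on each rank.
model = torch.nn.Linear(10, 10).cuda()
ddp_model = DDP(model)

# state=None means "use the default process group".
ddp_model.register_comm_hook(None, default_hooks.fp16_compress_hook)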
@@ -4520,7 +4520,7 @@ Default Communication Hooks
-
-torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_hook(process_group, bucket)[source]#
+torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_hook(process_group, bucket)[source]#
Compress by casting GradBucket to torch.float16 divided by process group size.
This DDP communication hook implements a simple gradient compression
approach that casts GradBucket tensor to half-precision floating-point format (torch.float16)
@@ -4542,7 +4542,7 @@
Default Communication Hooks
-
-torch.distributed.algorithms.ddp_comm_hooks.default_hooks.bf16_compress_hook(process_group, bucket)[source]#
+torch.distributed.algorithms.ddp_comm_hooks.default_hooks.bf16_compress_hook(process_group, bucket)[source]#
Warning: This API is experimental, and it requires NCCL version later than 2.9.6.
This DDP communication hook implements a simple gradient compression
approach that casts GradBucket tensor to half-precision
@@ -4567,7 +4567,7 @@
Default Communication Hooks
-
-torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_wrapper(hook)[source]#
+torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_wrapper(hook)[source]#
Cast input tensor to torch.float16, cast result of hook back to input dtype.
This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision
floating point format (torch.float16), and casts the resulting tensor of the given hook back to
@@ -4589,7 +4589,7 @@
Default Communication Hooks
-
-torch.distributed.algorithms.ddp_comm_hooks.default_hooks.bf16_compress_wrapper(hook)[source]#
+torch.distributed.algorithms.ddp_comm_hooks.default_hooks.bf16_compress_wrapper(hook)[source]#
Warning: This API is experimental, and it requires NCCL version later than 2.9.6.
This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision
Brain floating point format (torch.bfloat16),
@@ -4622,7 +4622,7 @@
PowerSGD Communication Hook#
-
-class torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState(process_group, matrix_approximation_rank=1, start_powerSGD_iter=1000, min_compression_rate=2, use_error_feedback=True, warm_start=True, orthogonalization_epsilon=0, random_seed=0, compression_stats_logging_frequency=10000, batch_tensors_with_same_shape=False)[source]#
+class torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState(process_group, matrix_approximation_rank=1, start_powerSGD_iter=1000, min_compression_rate=2, use_error_feedback=True, warm_start=True, orthogonalization_epsilon=0, random_seed=0, compression_stats_logging_frequency=10000, batch_tensors_with_same_shape=False)[source]#
Store both the algorithm’s hyperparameters and internal state for all gradients during training.
Particularly, matrix_approximation_rank and start_powerSGD_iter are the main hyperparameters that should be tuned by the user.
For performance, we suggest keeping the binary hyperparameters use_error_feedback and warm_start on.
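A sketch of the registration pattern, assuming an initialized process group and a model already on this rank:

import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.algorithms.ddp_comm_hooks import powerSGD_hook as powerSGD

# Assumes dist.init_process_group(...) has run and `model` is an nn.Module on this rank.
ddp_model = DDP(model)
state = powerSGD.PowerSGDState(
    process_group=None,              # default process group
    matrix_approximation_rank=1,     # main accuracy/compression trade-off
    start_powerSGD_iter=1000,        # plain allreduce for the first 1000 iterations
)
ddp_model.register_comm_hook(state, powerSGD.powerSGD_hook)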
@@ -4674,7 +4674,7 @@ PowerSGD Hooks
-
-torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.powerSGD_hook(state, bucket)[source]#
+torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.powerSGD_hook(state, bucket)[source]#
Implement PowerSGD algorithm.
This DDP communication hook implements PowerSGD gradient compression
algorithm described in the paper.
@@ -4739,7 +4739,7 @@
PowerSGD Hooks
-
-torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.batched_powerSGD_hook(state, bucket)[source]#
+torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.batched_powerSGD_hook(state, bucket)[source]#
Implement simplified PowerSGD algorithm.
This DDP communication hook implements a simplified PowerSGD gradient compression
algorithm described in the paper.
@@ -4807,7 +4807,7 @@
Debugging Communication Hooks
-
-torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks.noop_hook(_, bucket)[source]#
+torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks.noop_hook(_, bucket)[source]#
Return a future that wraps the input, so it is a no-op that does not incur any communication overheads.
This hook should only be used for headroom analysis of allreduce optimization,
instead of the normal gradient synchronization.
@@ -4845,12 +4845,12 @@
Checkpointing of Communication Hooks
PowerSGDState has __setstate__ and __getstate__ implemented and can be used as a reference.
-
-class torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState[source]
+class torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState[source]
-
-__getstate__()[source]#
+__getstate__()[source]#
Return a Dict[str, Any] which will be pickled and saved.
process_group is not serializable and excluded from
a returned state.
@@ -4858,7 +4858,7 @@ Checkpointing of Communication Hooks
-
-__setstate__(state)[source]#
+__setstate__(state)[source]#
Take a provided state and set to this PowerSGDState instance.
process_group is set to default.
diff --git a/2.9/distributed._dist2.html b/2.9/distributed._dist2.html
index b186db024c6..019d87dc917 100644
--- a/2.9/distributed._dist2.html
+++ b/2.9/distributed._dist2.html
@@ -4865,14 +4865,14 @@
-
-class torch.distributed._dist2.ProcessGroupFactory(*args, **kwargs)[source]#
+class torch.distributed._dist2.ProcessGroupFactory(*args, **kwargs)[source]#
Bases: Protocol
Protocol for process group factories.
-
-torch.distributed._dist2.current_process_group()[source]#
+torch.distributed._dist2.current_process_group()[source]#
Get the current process group. Thread local method.
- Returns
@@ -4886,7 +4886,7 @@
-
-torch.distributed._dist2.new_group(backend, timeout, device, **kwargs)[source]#
+torch.distributed._dist2.new_group(backend, timeout, device, **kwargs)[source]#
Create a new process group with the given backend and options. This group is
independent and will not be globally registered and thus not usable via the
standard torch.distributed.* APIs.
@@ -4911,7 +4911,7 @@
-
-torch.distributed._dist2.process_group(pg)[source]#
+torch.distributed._dist2.process_group(pg)[source]#
Context manager for process groups. Thread local method.
- Parameters
@@ -4925,7 +4925,7 @@
-
-torch.distributed._dist2.register_backend(name, func)[source]#
+torch.distributed._dist2.register_backend(name, func)[source]#
Register a new process group backend.
- Parameters
diff --git a/2.9/distributed.algorithms.join.html b/2.9/distributed.algorithms.join.html
index 80c7d887f4c..dabce633a70 100644
--- a/2.9/distributed.algorithms.join.html
+++ b/2.9/distributed.algorithms.join.html
@@ -4400,7 +4400,7 @@ Generic Join Context Manager
Distributed Training with Uneven Inputs Using the Join Context Manager.
-
-class torch.distributed.algorithms.Join(joinables, enable=True, throw_on_early_termination=False, **kwargs)[source]#
+class torch.distributed.algorithms.Join(joinables, enable=True, throw_on_early_termination=False, **kwargs)[source]#
This class defines the generic join context manager, which allows custom hooks to be called after a process joins.
These hooks should shadow the
collective communications of non-joined processes to prevent hanging and
@@ -4463,7 +4463,7 @@
Generic Join Context Manager
-
-static notify_join_context(joinable)[source]#
+static notify_join_context(joinable)[source]#
Notifies the join context manager that the calling process has not yet joined.
Then, if throw_on_early_termination=True, checks if uneven inputs have been detected
(i.e. if one process has already joined) and throws an exception if so.
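A usage sketch of the context manager, assuming an initialized process group, a DDP-wrapped model, an optimizer, and a per-rank inputs iterable that may have uneven length across ranks:

from torch.distributed.algorithms import Join

# `model`, `optimizer`, and `inputs` are assumed to exist on each rank.
with Join([model]):                  # DDP modules are Joinable
    for inp in inputs:
        optimizer.zero_grad()
        loss = model(inp).sum()
        loss.backward()
        optimizer.step()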
@@ -4491,7 +4491,7 @@ Generic Join Context Manager
-
-class torch.distributed.algorithms.Joinable[source]#
+class torch.distributed.algorithms.Joinable[source]#
This defines an abstract base class for joinable classes.
A joinable class
(inheriting from Joinable) should implement join_hook(),
@@ -4508,7 +4508,7 @@
Generic Join Context Manager
-
-