Test that TORCH_FEATURE_VERSION guards are used where needed #4
base: base_20251119_4462
Conversation
This PR outputs chars to the stream without building temporary strings. They were modified by (on fish) ``` sed -i -e 's/<< "\([^\\\']\)"/<< \'\1\'/g' (grep '<< "."' -r torch c10 aten -l) ``` with some invalid changes reverted afterwards. Pull Request resolved: pytorch#167899 Approved by: https://github.com/Skylion007
# Description
Fixes pytorch#114850; we will port the test utils and schema check to Intel GPU. We enable Intel GPU with the following methods while trying to keep the original code style:
# Changes
1. Get the device type with the accelerator and the get_devtype helper method.
2. Replace the requires-CUDA statements with device_type.
3. Add HAS_XPU and HAS_GPU checks to replace some of the existing checks.
Pull Request resolved: pytorch#166684 Approved by: https://github.com/ezyang, https://github.com/guangyey Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>
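For reference, the device-agnostic detection pattern this port relies on looks roughly like the following minimal sketch, assuming the public `torch.accelerator` API (the PR's own `get_devtype` helper may differ):

```python
import torch

# Resolve the available accelerator (XPU, CUDA, ...) instead of hardcoding CUDA.
acc = torch.accelerator.current_accelerator()
device_type = acc.type if acc is not None else "cpu"

# Device-agnostic tests can then allocate on whichever accelerator is present.
x = torch.randn(4, 4, device=device_type)
print(device_type, x.device)
```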
Summary: This diff would be a follow-up diff for D85883723. Test Plan: See D86719598. We are now able to publish the model. Unit test: ``` buck run fbcode//mode/opt -c remoteexecution.local=enabled fbcode//sigmoid/inference/test:test_passes -m ovr_config//triton:experimental -- -r test_triton_hop_cpu ``` Differential Revision: D87091238 Pull Request resolved: pytorch#167862 Approved by: https://github.com/XueningXu
Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: pytorch#167916 Approved by: https://github.com/Skylion007
**Summary:** Optimize scalar welford_reduce implementation, combining Welford algorithm with cascade sum to improve numerical stability. Specifically: 1. Use Welford algorithm to compute mean and variance. 2. Use cascade summation when computing sum over input for both mean and variance. **Example:** Take pytorch#141541 as an example: ``` import torch import torch.nn as nn torch.manual_seed(0) class Model(nn.Module): def __init__(self): super().__init__() self.gn = nn.GroupNorm(num_groups=32, num_channels=32) def forward(self, x): return self.gn(x) model = Model().eval() x = torch.randn(1, 32, 128, 128, 128) with torch.no_grad(): output = model(x) with torch._inductor.config.patch({"cpp.simdlen": 0}): c_model = torch.compile(model) c_output = c_model(x) print(torch.max(torch.abs(output - c_output))) print(torch.allclose(output, c_output, 1.3e-6, 1e-5)) ``` **logs** - before ``` tensor(0.0005) False ``` - After ``` tensor(1.4305e-06) True ``` **Generated code:** - before ``` cpp_fused_native_group_norm_0 = async_compile.cpp_pybinding(['float*', 'float*', 'const float*', 'const float*', 'const float*', 'float*'], ''' #include <torch/csrc/inductor/cpp_prefix.h> extern "C" void kernel(float* in_out_ptr0, float* in_out_ptr1, const float* in_ptr0, const float* in_ptr1, const float* in_ptr2, float* out_ptr2) { auto out_ptr1 = in_out_ptr0; auto out_ptr0 = in_out_ptr1; { #pragma GCC ivdep for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L)) { { Welford<float> tmp_acc0 = Welford<float>(); Welford<float> tmp_acc0_arr[4]; for (int i = 0; i < 4; i++) { tmp_acc0_arr[i] = Welford<float>(); } #pragma omp parallel num_threads(4) { int tid = omp_get_thread_num(); Welford<float> tmp_acc0_local = Welford<float>(); #pragma omp for for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2097152L); x1+=static_cast<int64_t>(1L)) { { { auto tmp0 = in_ptr0[static_cast<int64_t>(x1 + 2097152L*x0)]; tmp_acc0_local = welford_combine(tmp_acc0_local, tmp0); } } } tmp_acc0_arr[tid] = tmp_acc0_local; } for (int tid = 0; tid < 4; tid++) { tmp_acc0 = welford_combine(tmp_acc0, tmp_acc0_arr[tid]); } in_out_ptr1[static_cast<int64_t>(x0)] = tmp_acc0.mean; in_out_ptr0[static_cast<int64_t>(x0)] = tmp_acc0.m2; } } } { #pragma GCC ivdep for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L)) { { { auto tmp0 = out_ptr1[static_cast<int64_t>(x0)]; auto tmp6 = in_ptr1[static_cast<int64_t>(x0)]; auto tmp8 = out_ptr0[static_cast<int64_t>(x0)]; auto tmp11 = in_ptr2[static_cast<int64_t>(x0)]; auto tmp1 = static_cast<float>(2097152.0); auto tmp2 = tmp0 / tmp1; auto tmp3 = static_cast<float>(1e-05); auto tmp4 = float(tmp2 + tmp3); auto tmp5 = 1 / std::sqrt(tmp4); auto tmp7 = float(tmp5 * tmp6); auto tmp9 = decltype(tmp8)(-tmp8); auto tmp10 = float(tmp9 * tmp7); auto tmp12 = float(tmp10 + tmp11); in_out_ptr0[static_cast<int64_t>(x0)] = tmp7; in_out_ptr1[static_cast<int64_t>(x0)] = tmp12; } } } } #pragma omp parallel num_threads(4) { int tid = omp_get_thread_num(); { #pragma omp for for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L)) { #pragma GCC ivdep for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2097152L); x1+=static_cast<int64_t>(1L)) { { { auto tmp0 = in_ptr0[static_cast<int64_t>(x1 + 2097152L*x0)]; auto tmp1 = in_out_ptr0[static_cast<int64_t>(x0)]; auto tmp3 = in_out_ptr1[static_cast<int64_t>(x0)]; auto tmp2 = float(tmp0 * tmp1); auto tmp4 = float(tmp2 + tmp3); 
out_ptr2[static_cast<int64_t>(x1 + 2097152L*x0)] = tmp4; } } } } } } } ''') async_compile.wait(globals()) del async_compile class Runner: def __init__(self, partitions): self.partitions = partitions def recursively_apply_fns(self, fns): new_callables = [] for fn, c in zip(fns, self.partitions): new_callables.append(fn(c)) self.partitions = new_callables def call(self, args): arg0_1, arg1_1, arg2_1 = args args.clear() assert_size_stride(arg0_1, (32, ), (1, )) assert_size_stride(arg1_1, (32, ), (1, )) assert_size_stride(arg2_1, (1, 32, 128, 128, 128), (67108864, 2097152, 16384, 128, 1)) buf0 = empty_strided_cpu((1, 32, 1, 1), (32, 1, 32, 32), torch.float32) buf1 = empty_strided_cpu((1, 32, 1, 1), (32, 1, 32, 32), torch.float32) buf3 = reinterpret_tensor(buf1, (1, 32, 1, 1), (32, 1, 1, 1), 0); del buf1 # reuse buf4 = reinterpret_tensor(buf0, (1, 32, 1, 1), (32, 1, 1, 1), 0); del buf0 # reuse buf5 = empty_strided_cpu((1, 32, 128, 128, 128), (67108864, 2097152, 16384, 128, 1), torch.float32) # [Provenance debug handles] cpp_fused_native_group_norm_0:1 cpp_fused_native_group_norm_0(buf3, buf4, arg2_1, arg0_1, arg1_1, buf5) del arg0_1 del arg1_1 del arg2_1 return (buf5, ) ``` - After ``` cpp_fused_native_group_norm_0 = async_compile.cpp_pybinding(['float*', 'float*', 'const float*', 'const float*', 'const float*', 'float*'], ''' #include <torch/csrc/inductor/cpp_prefix.h> extern "C" void kernel(float* in_out_ptr0, float* in_out_ptr1, const float* in_ptr0, const float* in_ptr1, const float* in_ptr2, float* out_ptr2) { auto out_ptr1 = in_out_ptr0; auto out_ptr0 = in_out_ptr1; { #pragma GCC ivdep for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L)) { { Welford<float> tmp_acc0 = Welford<float>(); Welford<float> tmp_acc0_arr[4]; for (int i = 0; i < 4; i++) { tmp_acc0_arr[i] = Welford<float>(); } #pragma omp parallel num_threads(4) { int tid = omp_get_thread_num(); WelfordHelper<float, float, 4096> scalar_welford_helper0(static_cast<int64_t>(524288L)); Welford<float> tmp_acc0_local = Welford<float>(); #pragma omp for for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2097152L); x1+=static_cast<int64_t>(1L)) { { { auto tmp0 = in_ptr0[static_cast<int64_t>(x1 + 2097152L*x0)]; tmp_acc0_local = welford_combine(tmp_acc0_local, tmp0, &scalar_welford_helper0); } } } tmp_acc0_local = welford_combine(tmp_acc0_local, &scalar_welford_helper0); tmp_acc0_arr[tid] = tmp_acc0_local; } for (int tid = 0; tid < 4; tid++) { tmp_acc0 = welford_combine(tmp_acc0, tmp_acc0_arr[tid]); } in_out_ptr1[static_cast<int64_t>(x0)] = tmp_acc0.mean; in_out_ptr0[static_cast<int64_t>(x0)] = tmp_acc0.m2; } } } { #pragma GCC ivdep for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L)) { { { auto tmp0 = out_ptr1[static_cast<int64_t>(x0)]; auto tmp6 = in_ptr1[static_cast<int64_t>(x0)]; auto tmp8 = out_ptr0[static_cast<int64_t>(x0)]; auto tmp11 = in_ptr2[static_cast<int64_t>(x0)]; auto tmp1 = static_cast<float>(2097152.0); auto tmp2 = tmp0 / tmp1; auto tmp3 = static_cast<float>(1e-05); auto tmp4 = float(tmp2 + tmp3); auto tmp5 = 1 / std::sqrt(tmp4); auto tmp7 = float(tmp5 * tmp6); auto tmp9 = decltype(tmp8)(-tmp8); auto tmp10 = float(tmp9 * tmp7); auto tmp12 = float(tmp10 + tmp11); in_out_ptr0[static_cast<int64_t>(x0)] = tmp7; in_out_ptr1[static_cast<int64_t>(x0)] = tmp12; } } } } #pragma omp parallel num_threads(4) { int tid = omp_get_thread_num(); { #pragma omp for for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); 
x0+=static_cast<int64_t>(1L)) { #pragma GCC ivdep for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2097152L); x1+=static_cast<int64_t>(1L)) { { { auto tmp0 = in_ptr0[static_cast<int64_t>(x1 + 2097152L*x0)]; auto tmp1 = in_out_ptr0[static_cast<int64_t>(x0)]; auto tmp3 = in_out_ptr1[static_cast<int64_t>(x0)]; auto tmp2 = float(tmp0 * tmp1); auto tmp4 = float(tmp2 + tmp3); out_ptr2[static_cast<int64_t>(x1 + 2097152L*x0)] = tmp4; } } } } } } } ''') async_compile.wait(globals()) del async_compile class Runner: def __init__(self, partitions): self.partitions = partitions def recursively_apply_fns(self, fns): new_callables = [] for fn, c in zip(fns, self.partitions): new_callables.append(fn(c)) self.partitions = new_callables def call(self, args): arg0_1, arg1_1, arg2_1 = args args.clear() assert_size_stride(arg0_1, (32, ), (1, )) assert_size_stride(arg1_1, (32, ), (1, )) assert_size_stride(arg2_1, (1, 32, 128, 128, 128), (67108864, 2097152, 16384, 128, 1)) buf0 = empty_strided_cpu((1, 32, 1, 1), (32, 1, 32, 32), torch.float32) buf1 = empty_strided_cpu((1, 32, 1, 1), (32, 1, 32, 32), torch.float32) buf3 = reinterpret_tensor(buf1, (1, 32, 1, 1), (32, 1, 1, 1), 0); del buf1 # reuse buf4 = reinterpret_tensor(buf0, (1, 32, 1, 1), (32, 1, 1, 1), 0); del buf0 # reuse buf5 = empty_strided_cpu((1, 32, 128, 128, 128), (67108864, 2097152, 16384, 128, 1), torch.float32) # [Provenance debug handles] cpp_fused_native_group_norm_0:1 cpp_fused_native_group_norm_0(buf3, buf4, arg2_1, arg0_1, arg1_1, buf5) del arg0_1 del arg1_1 del arg2_1 return (buf5, ) ``` Pull Request resolved: pytorch#162709 Approved by: https://github.com/CaoE, https://github.com/jansel
) Test Plan: CI Differential Revision: D86211542 Pull Request resolved: pytorch#167799 Approved by: https://github.com/njriasan, https://github.com/eellison
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: pytorch#167914 Approved by: https://github.com/pytorchbot
Pull Request resolved: pytorch#167198 Approved by: https://github.com/bobrenjc93
This PR fixes a bug where `torch.clamp` on MPS fails when min/max tensors have more dimensions than the input tensor. CPU already supports this broadcasting, but MPS raised a RuntimeError. Example of failing case before the fix: ```python x = torch.randn(2, 3, device="mps") min_t = torch.randn(1, 2, 3, device="mps") max_t = torch.randn(1, 2, 3, device="mps") torch.clamp(x, min=min_t, max=max_t) # RuntimeError ``` After this fix, MPS matches CPU behavior. Fixes pytorch#160734 Pull Request resolved: pytorch#165058 Approved by: https://github.com/malfet
…7734)" This reverts commit 226850c. Reverted pytorch#167734 on behalf of https://github.com/Aidyn-A due to fails on CUDA 12.8 ([comment](pytorch#167734 (comment)))
The PR pytorch#167401 reminded me that the removal of old NVTX interface is long overdue, as the header-only NVTX3 has been around for more than 5 years and is shipped with all CUDA Toolkit versions of 12+. In addition to that, `libnvToolsExt.so` was removed in CUDA Toolkit 13 and onward. Pull Request resolved: pytorch#167637 Approved by: https://github.com/eqy
…device allocator (pytorch#166831) The implementation plan of MemPool for XPU, which is a dependency of [XPUGraph](pytorch#166285), following the [RFC](pytorch#162143). - [ ] ->pytorch#166831 - [ ] pytorch#166833 - [ ] pytorch#166843 Pull Request resolved: pytorch#166831 Approved by: https://github.com/EikanWang, https://github.com/gujinghui Co-authored-by: Eikan Wang <eikan.wang@intel.com>
…lasLtWorkspace" (pytorch#167928) Summary: getCurrentCUDABlasHandle() and getCUDABlasLtWorkspace() use static mutable maps that are not protected from concurrent read-and-write. This leads to crashes. This diff adds mutexes to synchronize access to the static maps. Re-land context: This is a re-land of pytorch#167248. A few issues were addressed: - fix for a bug in fast path: premature return in getCurrentCUDABlasHandle) - fix for test flakiness (pytorch#167884) Test Plan: 1. regression tests: buck2 test \mode/opt //caffe2/test\:test_transformers_cuda https://www.internalfb.com/intern/testinfra/testrun/6192449759713581 2. Use a GPU OD, run multi-threaded tests with TSAN: buck test fbcode//mode/dev-tsan fbcode//caffe2:cuda_cublas_handle_pool_test -- --stress-runs 100 https://www.internalfb.com/intern/testinfra/testrun/14355223937501118 Differential Revision: D87111985 Pull Request resolved: pytorch#167928 Approved by: https://github.com/Skylion007
…rnels (pytorch#158250) Co-authored-by: Nikhil Gupta [nikhil.gupta2@arm.com](mailto:nikhil.gupta2@arm.com)
This PR enables the use of KleidiAI INT4 kernels that directly produce BF16 outputs within PyTorch to boost LLM prefill & decode performance.
**This change improves decode throughput by ~15% and reduces the memory required for model inference by 50%.**
### Benchmark Setup
```
Model: meta-llama/Llama-3.1-8B
Test Platform: Neoverse V2
```
### Detailed Results
| Metric | With `--compile` | Without `--compile` |
|--------|------------------|---------------------|
| Quantization Scheme | INT4 symmetric channelwise | INT4 symmetric channelwise |
| Input Precision | BF16 | BF16 |
| Number of Layers Quantized | 32 | 32 |
| Average Compression Ratio | 87.49% | 87.49% |
| Total Quantization Time (s) | 9.62 | 10.32 |
| Compile Time (First) (s) | 134.48 | 1.69 |
| Compile Time (Second) (s) | 80.44 | 1.60 |
| Compile Time (Subsequent) (s) | 0.19 | 0.22 |
| Prefill Tokens | 54 | 54 |
| Decoded Tokens | 33 | 33 |
| Prefill Time (s) | 0.19 | 0.22 |
| Decode Time (s) | 0.76 | 1.38 |
| E2E Generation Time (s) | 0.95 | 1.60 |
| Prefill Throughput (tokens/s) | 288.13 | 249.91 |
| Decode Throughput (tokens/s) | 43.42 | 23.83 |

Pull Request resolved: pytorch#158250 Approved by: https://github.com/malfet, https://github.com/aditew01, https://github.com/fadara01 Co-authored-by: Nikhil Gupta <nikhil.gupta2@arm.com> Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned xla hash. Pull Request resolved: pytorch#167968 Approved by: https://github.com/pytorchbot
Update the torch-xpu-ops commit to [intel/torch-xpu-ops@1e69f4](intel/torch-xpu-ops@1e69f40), which includes:
- Add PTL to the default AOT target list for both Windows and Linux
- Use the PyTorch p2p API in the Copy kernel
- Add event cache and event timing to XCCL
- Add Float8_e8m0fnu support for copy
- Add CMAKE_SYCL_COMPILER_LAUNCHER for sccache
Pull Request resolved: pytorch#167698 Approved by: https://github.com/EikanWang
Exposes `_inductor.config.bucket_all_reduces_fx`, similar to the existing all_gathers and reduce_scatters options, with "all" as the only supported value. Pull Request resolved: pytorch#167634 Approved by: https://github.com/eellison
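Usage is expected to mirror the existing bucketing knobs; a hedged sketch ("all" is the only value this PR mentions):

```python
import torch._inductor.config as inductor_config

# Enable fx-pass bucketing of all-reduces, analogous to the existing
# all_gather / reduce_scatter bucketing options.
inductor_config.bucket_all_reduces_fx = "all"
```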
Make the PyObject preservation scheme thread-safe with free threaded (nogil) Python. The general idea is:
* Python Tensor and Storage objects always hold a strong reference to their underlying c10 object
* c10 objects hold a strong reference to their Python objects if there's at least one other reference to the c10 object

This is implemented in `intrusive_ptr`:
* The top most bit (`kHasPyObject`) of the weakref count is now used to indicate whether the `intrusive_ptr_target` has an associated PyObject. So `kHasPyObject` is one bit, the weakref count is now 31 bits, and the strong refcount remains 32 bits.
* When the reference count increases from one to two and `kHasPyObject` is set, we incref the associated Python object to ensure that it's kept alive.
* When the reference count decreases from two to one (i.e., there are no C++ references to the `intrusive_ptr_target` other than from the Python object), we decref the associated Python object to break the cycle.

Other benefits:
* We can delete a lot of the copypasta from Python's internal `subtype_dealloc`
* This fixes the weakref and GC bugs we had in the previous scheme. Python weakrefs on Tensors and Storages should just work as expected now.

Risks:
* Extra branch for reference count operations on `intrusive_ptr<TensorImpl>`, `intrusive_ptr<StorageImpl>`, and the generic `intrusive_ptr<intrusive_ptr_target>`, even when we're not using Python.
* It's a big change

(Second attempt at pytorch#166342) Pull Request resolved: pytorch#167564 Approved by: https://github.com/albanD, https://github.com/Skylion007
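To make the refcount transitions concrete, here is an illustrative sketch of how a has-PyObject flag can share the combined count word; the layout and constants are simplified stand-ins, not the actual intrusive_ptr.h code:

```cpp
#include <cstdint>

// Simplified layout: low 32 bits = strong count, next 31 bits = weak count,
// top bit = "this target has an associated PyObject".
constexpr uint64_t kHasPyObject = uint64_t(1) << 63;
constexpr uint64_t kStrongMask = 0xFFFFFFFFull;

inline bool has_pyobject(uint64_t combined) {
  return (combined & kHasPyObject) != 0;
}

inline uint32_t strong_count(uint64_t combined) {
  return static_cast<uint32_t>(combined & kStrongMask);
}

// On incref: the strong count going 1 -> 2 with the flag set means a second
// C++ reference now exists, so the PyObject must be increfed to keep it alive.
inline bool needs_pyobject_incref(uint64_t old_combined) {
  return has_pyobject(old_combined) && strong_count(old_combined) == 1;
}

// On decref: the strong count going 2 -> 1 means the Python object holds the
// only remaining reference, so its refcount can be dropped to break the cycle.
inline bool needs_pyobject_decref(uint64_t old_combined) {
  return has_pyobject(old_combined) && strong_count(old_combined) == 2;
}
```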
Previously we hard-failed if the pg was "gloo". Now we fall back on hardcoded formulas. Pull Request resolved: pytorch#167827 Approved by: https://github.com/eellison
pytorch#166044 removes openblas from whl dependency list for AArch64+CPU build so this PR adds it back. Only affects CPU build since AArch64+CUDA uses NVPL. Pull Request resolved: pytorch#167841 Approved by: https://github.com/tinglvv, https://github.com/malfet
Use standard HIP headers for unsafeAtomicAdd. Removes copy/paste of unsafeAtomicAdd as "preview" implementation for gfx942. Pull Request resolved: pytorch#167661 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>
…rch#165067)" This reverts commit 96a4c4b. Reverted pytorch#165067 on behalf of https://github.com/jeanschmidt due to breaks internal tests see D87036515, @albanD please help the author get this PR merged ([comment](pytorch#165067 (comment)))
This reverts commit e20ca3b. Reverted pytorch#167049 on behalf of https://github.com/jeanschmidt due to breaks internal tests, see D87120562, @Skylion007 please help the author get this PR merged ([comment](pytorch#167049 (comment)))
This reverts commit 2245d7d. Reverted pytorch#167899 on behalf of https://github.com/jeanschmidt due to need to revert in order to revert pytorch#167899 ([comment](pytorch#167899 (comment)))
This reverts commit deabb3e. Reverted pytorch#167821 on behalf of https://github.com/jeanschmidt due to Breaks internal tests, see D87148810. @Skylion007 may you help the author to get this PR merged? ([comment](pytorch#167821 (comment)))
Alas, one cannot use `repeat_interleave_common` for MPS tensors, as `data_offset` is not a valid pointer to `id<MTLTensor>`. On the other hand, one does not need to use `AT_DISPATCH_INDEX_TYPES`, as dispatching happens on the shader side. Fixes pytorch#167924 Pull Request resolved: pytorch#167961 Approved by: https://github.com/manuelcandales
Summary: MXFP4 unit tests pass on B200 but fail on RTX 5090, so disable the non-B200 cases. Also fail with a not-implemented error on non-B200 devices to avoid unhelpful failure messages. Test Plan: ``` pytest -sv -k "mxfp4" test/test_scaled_matmul_cuda.py ``` Reviewers: @nWEIdia Fixes pytorch#167850 Signed-off-by: Simon Layton <simonlayton@meta.com> Pull Request resolved: pytorch#167857 Approved by: https://github.com/nWEIdia, https://github.com/malfet
Upgrade all the ROCm docker images to ROCm 7.1 release version. Pull Request resolved: pytorch#166743 Approved by: https://github.com/atalman, https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com> Co-authored-by: Prachi Gupta <prachi.gupta@amd.com>
…7860) getAllOperatorsFor returns a const reference to internal state that is protected by a lock. Presuming that the lock is necessary in the first place (about which I offer no opinion because it's unclear to what extent the GIL should help here), this is a straightforward way to cause callers to create race conditions. This should fix those race conditions by copying the state instead. I modified calling code to stop binding a const reference to the result for clarity. Differential Revision: [D87088731](https://our.internmc.facebook.com/intern/diff/D87088731/) **NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D87088731/)! Pull Request resolved: pytorch#167860 Approved by: https://github.com/zou3519
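A minimal sketch of the copy-under-lock pattern this change adopts; the names are illustrative, not the actual torch::jit registry declarations:

```cpp
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

struct Operator {};

class Registry {
 public:
  // Before: returned const std::vector<std::shared_ptr<Operator>>&, letting
  // callers read the vector after the lock was released (a data race if
  // another thread registers an operator concurrently).
  // After: return by value while the lock is still held.
  std::vector<std::shared_ptr<Operator>> getAllOperatorsFor(const std::string& name) {
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = operators_.find(name);
    return it == operators_.end() ? std::vector<std::shared_ptr<Operator>>{}
                                  : it->second;  // copied before the lock drops
  }

 private:
  std::mutex mutex_;
  std::unordered_map<std::string, std::vector<std::shared_ptr<Operator>>> operators_;
};
```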
…ytorch#161728) Resolves pytorch#161290
## Summary
Expands `dynamo/check_perf_csv.py` output capabilities with latency, compile time, and memory information:
- Displays the measured speedup and the % deviation from the target
- Added clear messaging when all model tests pass and no regression is found
- Added error handling for a missing CSV file
### Example (Failing Check)
```bash
python benchmarks/dynamo/check_perf_csv.py -f reports-dir/inductor_training_smoketest.csv -t 1.40
```
**Example Output:**
```
Checking inductor_training_smoketest.csv (speedup threshold >= 1.40x)
hf_Bert speedup=1.005x, latency=390.8 ms/iter, compile=1.526s, mem_ratio=1.02x (eager=360.6 GB, dynamo=369.3 GB)
Error 1 model(s) performance regressed
 hf_Bert
- hf_Bert: 1.005x (< 1.40x; -28.2% from target)
```
### Example (Passing Check)
```bash
python benchmarks/dynamo/check_perf_csv.py -f reports-dir/inductor_training_smoketest.csv -t 1.40
```
**Example Output:**
```
Checking inductor_training_smoketest.csv (speedup threshold >= 1.00x)
hf_Bert speedup=1.005x, latency=390.8 ms/iter, compile=1.526s, mem_ratio=1.02x (eager=360.6 GB, dynamo=369.3 GB)
All 1 model(s) passed threshold check (>= 1.00x)
```
Pull Request resolved: pytorch#161728 Approved by: https://github.com/isuruf
This reverts commit 99fdca8. Reverted pytorch#166492 on behalf of https://github.com/jeanschmidt due to Internally we still depend on the old logic, so we need to find a way to maintain backwards compatibility, for now ([comment](pytorch#166492 (comment)))
Pull Request resolved: pytorch#167772 Approved by: https://github.com/janeyx99
…orch::stable::Tensor. (pytorch#161891) This ghstack is a prerequisite for porting torchaudio C++ extensions to use torch stable ABI, see pytorch/audio#4074, pytorch/audio#4075, pytorch/audio#4076, pytorch/audio#4077, pytorch/audio#4078 Pull Request resolved: pytorch#161891 Approved by: https://github.com/mikaylagawarecki ghstack dependencies: pytorch#167772
The following tests are failing with Python 3.14 on Linux machines:
* TestSetAffinity::test_set_affinity_in_worker_init
* Why? 3.14 makes `forkserver` the default start method for multiprocessing. With it, local functions are not picklable and the unit test fails (see the sketch after this list).
* TestIndividualWorkerQueue::test_ind_worker_queue
* Why? The test was hitting timeout. This is also related to the start method. I am increasing timeout and reducing batch size iterations to reduce total unit test time.
* Fixes pytorch#68643
Pull Request resolved: pytorch#167429
Approved by: https://github.com/aelavender, https://github.com/ramanishsingh
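A hedged sketch of the picklability issue behind the first fix: under spawn/forkserver start methods the worker callables are pickled, so they must be module-level rather than defined inside a test method. Names below are illustrative, not the exact test code.

```python
import os
from torch.utils.data import DataLoader, Dataset


class RangeDataset(Dataset):
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        return idx


def _worker_set_affinity_init(worker_id):
    # Module-level, so it survives pickling under spawn/forkserver.
    # sched_setaffinity is Linux-only, matching the test environment.
    os.sched_setaffinity(0, {0})


if __name__ == "__main__":
    loader = DataLoader(RangeDataset(), num_workers=2,
                        worker_init_fn=_worker_set_affinity_init)
    print(list(loader))
```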
This reverts commit 77acc66. Reverted pytorch#166743 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](pytorch#166743 (comment)))
…7633) Pull Request resolved: pytorch#167633 Approved by: https://github.com/eellison ghstack dependencies: pytorch#167827
Not sure if the paths are already properly set up so I can call 'benchmarks/dynamo/huggingface.py' in the unit test directly. Let's see what CI says. Pull Request resolved: pytorch#167482 Approved by: https://github.com/v0i0, https://github.com/mlazos
Inductor may treat an outer reduction as an inner reduction when the reduction ranges contain a 1. This causes a weird issue where we skip fusing with mix-order reduction. While I'm still debugging why that happens, I think we should fix the decision here anyway. Pull Request resolved: pytorch#167697 Approved by: https://github.com/jansel, https://github.com/v0i0
Fixes pytorch#158429 Updated LogAddExpKernel.cu to allow for complex numbers. Also updated the unit test to run test_logaddexp on CUDA with complex data types and added a unit test in test_linalg.py to compare results between CUDA and CPU. @drisspg Pull Request resolved: pytorch#163509 Approved by: https://github.com/isuruf
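A hedged sketch of the kind of CPU-vs-CUDA comparison the new test performs (illustrative, not the exact test body):

```python
import torch

# With complex support in the CUDA kernel, results should match the CPU path.
a = torch.randn(64, dtype=torch.complex64)
b = torch.randn(64, dtype=torch.complex64)

cpu_out = torch.logaddexp(a, b)
cuda_out = torch.logaddexp(a.cuda(), b.cuda())
torch.testing.assert_close(cpu_out, cuda_out.cpu())
```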
Enables mm out for sparse tensors Pull Request resolved: pytorch#167908 Approved by: https://github.com/malfet
…#167931) Per title:
1) allows the `self` argument to have the same precision as the output
2) fixes broadcasting of the `self` argument - it used to allocate an incorrectly sized output and resize it later, causing a warning in addmm and erroring out in baddbmm
3) fixes `out` handling for the `out` baddbmm overload, where the implementation used uninitialized memory in `out` instead of copying `self` to `out`
4) removes a couple of unneeded IIFE patterns
Pull Request resolved: pytorch#167931 Approved by: https://github.com/PaulZhang12, https://github.com/drisspg, https://github.com/malfet
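A hedged illustration of the `self` broadcasting semantics referenced in item 2, using the public `baddbmm` API; the fix itself targets the dtype/out overloads, and the shapes here are just examples:

```python
import torch

batch, n, m, p = 4, 8, 16, 32
self_t = torch.randn(n, 1, device="cuda")   # broadcasts over batch and p
b1 = torch.randn(batch, n, m, device="cuda")
b2 = torch.randn(batch, m, p, device="cuda")

# `out` must already have the broadcasted result shape (batch, n, p).
out = torch.empty(batch, n, p, device="cuda")
torch.baddbmm(self_t, b1, b2, out=out)

ref = self_t + torch.bmm(b1, b2)            # same broadcasting applied manually
torch.testing.assert_close(out, ref)
```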
…idiAI kernels (pytorch#158250)" This reverts commit 53809f9. Reverted pytorch#158250 on behalf of https://github.com/zou3519 due to reverting to see if it fixes inductor halide test failure ([comment](pytorch#158250 (comment)))
Summary: Add support for symint placeholders. Added two test cases with dynamic reshape:
- dynamic info coming from tmd on placeholders
- dynamic info coming from placeholders (symints)
Test Plan: test_reshape_dynamic_ph, test_reshape_dynamic_tmd Differential Revision: D86984100 Pull Request resolved: pytorch#167757 Approved by: https://github.com/blaine-rister
…locate test into `TestSaveLoad` (pytorch#158247) This is a follow-up to [pytorch#154333](pytorch#154333), where I initially introduced a fallback mechanism in deserialize_torch_artifact. In this revised PR: Cleaned up commit history for clarity and reproducibility. Relocated the test into the TestSaveLoad class in test_serialize.py. There were some issues with last PR so opened this PR The previous PR had inconsistencies due to local branch issues and was closed in favor of this cleaner submission. Feedback is very welcome Pull Request resolved: pytorch#158247 Approved by: https://github.com/angelayi
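For context, the fallback mechanism being tested looks roughly like this hedged sketch (not the actual serialize.py code; the surrounding function name is per the file walkthrough below):

```python
import io
import logging
import torch

log = logging.getLogger(__name__)


def _deserialize_artifact(serialized: bytes):
    # Try the safe path first; fall back to full unpickling with a warning.
    buffer = io.BytesIO(serialized)
    try:
        return torch.load(buffer, weights_only=True)
    except Exception:
        log.warning("weights_only load failed; retrying with weights_only=False")
        buffer.seek(0)
        return torch.load(buffer, weights_only=False)
```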
This reverts commit 99117c1. Reverted pytorch#167637 on behalf of https://github.com/yangw-dev due to breaks internal build with torch/csrc/profiler/stubs/cuda.cpp:4:10: fatal error: 'nvtx3/nvtx3.hpp' file not found 4 | #include <nvtx3/nvtx3.hpp>, please find a meta fella to resolve this issue and try again, diff:[D87229660] ([comment](pytorch#167637 (comment)))
This reverts commit 7ede33b. Reverted pytorch#167771 on behalf of https://github.com/eellison due to needs one fix ([comment](pytorch#167771 (comment)))
… used where needed" Splits each torch library registration in the 2.10 folder into its own file -- I had a script that parsed kernel.cpp to do this but I felt like forcing this responsibility on the user might be less error prone Compiles each file targetting 2.9 and asserts that compilation fails. (There are 2 2.9 kernels we use as negative tests where compilation is expected to succeed) [ghstack-poisoned]
… used where needed" Splits each torch library registration in the 2.10 folder into its own file -- I had a script that parsed kernel.cpp to do this but I felt like forcing this responsibility on the user might be less error prone Compiles each file targetting 2.9 and asserts that compilation fails. (There are 2 2.9 kernels we use as negative tests where compilation is expected to succeed) [ghstack-poisoned]
User description
Splits each torch library registration in the 2.10 folder into its own file -- I had a script that parsed kernel.cpp to do this but I felt like forcing this responsibility on the user might be less error prone
Compiles each file targeting 2.9 and asserts that compilation fails. (There are 2 2.9 kernels we use as negative tests where compilation is expected to succeed)
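For context, the guard pattern being verified looks roughly like the following self-contained sketch; the macro values and the method are stand-ins, while the real guards live in the stable ABI headers (see tensor_inl.h in the file walkthrough):

```cpp
// Hedged sketch, not a file from this PR.
#define TORCH_VERSION_2_9_0 9
#define TORCH_VERSION_2_10_0 10

#ifndef TORCH_FEATURE_VERSION
// Normally set by the build according to the targeted feature version.
#define TORCH_FEATURE_VERSION TORCH_VERSION_2_10_0
#endif

struct Tensor {
  void* data = nullptr;

#if TORCH_FEATURE_VERSION >= TORCH_VERSION_2_10_0
  // Only declared when the extension targets 2.10+; a kernel that calls this
  // while targeting 2.9 fails to compile, which is what the new test asserts.
  void* mutable_data_ptr() const { return data; }
#endif
};
```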
Stack from ghstack (oldest at bottom):
PR Type
Enhancement, Bug fix, Tests
Description
This is a large, multi-faceted PR that includes several major refactoring efforts and improvements across the PyTorch codebase:
PyObject Lifecycle Management Refactoring:
Simplified PyObject preservation and reference counting in `intrusive_ptr`, `TensorImpl`, and `StorageImpl`
Replaced the complex `MaybeOwned` wrapper with direct tensor storage and atomic PyObject slot management
Added thread-safe PyObject initialization with atomic compare-exchange patterns
Removed resurrection logic and simplified Python object lifecycle tracking
Thread Safety Improvements:
Added mutex protection to cuBLAS workspace management with double-checked locking
Improved JIT operator registry thread safety by returning copies instead of references
Enhanced PyInterpreter interface with `try_incref()` and `refcnt()` methods
Fixed cudagraph reference counting logic to account for multiple references
ROCm/HIP Removal:
Removed ROCm-specific code from static CUDA launcher, triton heuristics, and BLAS implementations
Simplified kernel binary format handling to CUDA-only (`cubin`)
Removed HIP-specific atomic add implementations and conditional compilation blocks
Device-Agnostic and Multi-Device Support:
Refactored distributed tests to use device-agnostic APIs and multi-device instantiation
Updated test utilities to support XPU alongside CUDA
Added device type detection and lazy initialization for checkpoint operations
Improved backend specification in distributed test decorators
Filesystem Dependency Removal:
Replaced `c10::filesystem` with custom cross-platform file utilities
Updated logging, exception handling, and JIT components to use string manipulation instead of filesystem APIs
Inductor and Compilation Improvements:
Simplified memory coalescing analysis by removing broadcast detection
Improved Welford reduction helper handling in C++ codegen
Added all-reduce bucketing pass configuration for distributed operations
Fixed `fx_wrapper` mode to properly handle symbolic scalars and flatten arguments
Added SIMD tiling score simplification
Numeric and Kernel Enhancements:
Added complex number support to logaddexp operations
Added MXFP4 GPU support validation for B200/B300 devices
Refactored CUDA BLAS bias handling with optional parameters
Added XPU graph memory pool management
Test Coverage Expansion:
Added complex number logaddexp CPU vs CUDA tests
Added thread safety tests for gradients and storage
Added run-to-run determinism tests for inductor models
Added data pointer accessor tests for stable ABI
Added MPS regression and broadcasting tests
Updated variable naming in dynamic shape and auto-functionalize tests
API Deprecations:
Added deprecation annotations to the `_check_is_size` and `guard_size_oblivious` functions
Updated usages to use alternative APIs
Configuration and Utilities:
Added `bucket_all_reduces_fx` configuration options for distributed operations
Enhanced performance CSV checking with detailed metrics
Added weights-only safety checks to model deserialization
Improved dataloader worker affinity testing
File Walkthrough
18 files
test_dynamic_shapes.py
Update variable naming in dynamic shape test assertions (test/test_dynamic_shapes.py)
- Use simplified names (`ge`, `ge_1`, `ge_2`, etc.) instead of numbered suffixes (`ge_1`, `ge_2`, `ge_3`, etc.) for variables in dynamic shape assertions
test_auto_functionalize.py
Update variable naming in auto-functionalize test outputs (test/inductor/test_auto_functionalize.py)
- Updated expected outputs to use `ge` instead of `ge_1`, matching the new naming patterns
test_higher_order_ops.py
Reduce operation counts and remove size check operations (test/dynamo/test_higher_order_ops.py)
- Reduced expected operation counts (9, 8 to 7, 17 to 15, 13 to 11)
- Removed `_check_is_size` operation calls from expected IR output strings
test_linalg.py
Add complex number logaddexp CPU vs CUDA test (test/test_linalg.py)
- Added `test_logaddexp_cpu_vs_cuda_complex()` for complex-number logaddexp operations
- Covers edge cases (infinity, NaN) and compares the CPU and CUDA implementations
test_matmul_cuda.py
Expand addmm/baddmm tests with broadcast and output variants (test/test_matmul_cuda.py)
- Added `N` and `batch_size` parameters to `test_addmm_baddmm_dtype_overload()`; added `broadcast_self` and `high_precision_self` to the test method
- Updated the `create_inputs()` function to handle broadcast shapes for the `c` tensor
- Tests the `out` variant of `addmm` and `baddbmm` operations across dtypes
test_libtorch_agnostic.py
Add data pointer retrieval tests for stable ABI (test/cpp_extensions/test_libtorch_agnostic.py)
- Added a `get_supported_dtypes()` function listing all supported dtypes for the stable ABI
- Added `test_get_any_data_ptr()` and `test_get_template_any_data_ptr()` tests
- Marked the new tests with `@skipIfTorchVersionLessThan(2, 10)`
test_deterministic.py
Add run-to-run determinism test for inductor models (test/inductor/test_deterministic.py)
- Added `test_run2run_determinism()` with parametrization over model names, training/inference modes, and precision types
- Runs each configuration with the inductor backend and deterministic mode enabled
test_inductor_collectives.py
Add gloo backend NCCL estimator regression test (test/distributed/test_inductor_collectives.py)
- Added a `_pass` function for bucketing all-reduce operations and a `bucket_mode` parameter to the inductor config patch
- Added `test_regression_use_nccl_estimate_with_gloo()` for gloo backend compatibility
- Added the `@requires_gloo()` decorator to the new test method
test_mps.py
Add MPS regression and broadcasting tests (test/test_mps.py)
- Added `test_repeat_interleave_offset` for the issue "Crash on MPS when using repeat_interleave with sliced tensor" (pytorch#167924)
- Added `test_clamp_tensor_bounds_broadcasting` to verify the clamp operation with tensor bounds and broadcasting
- Updated the `test_clamp_max` method
test_fake_distributed.py
Update fake distributed test expected output (test/dynamo/test_fake_distributed.py)
- Changed variable naming in the expected output from `ge_1`, `ge_3`, `ge_5` to `ge`, `ge_1`, `ge_2` for consistency
test_mix_order_reduction.py
Expand rms_norm_bwd test coverage with new shapes (test/inductor/test_mix_order_reduction.py)
- Added shape `(1000000, 256)` to the `test_rms_norm_bwd` test
- Added an `add_1dim` parametrization/parameter to test with an additional dimension
test_serialize.py
Add torch artifact deserialization test (test/export/test_serialize.py)
- Added `test_deserialize_torch_artifact_dict` to verify that the `deserialize_torch_artifact` function deserializes dictionary objects
test_autograd.py
Add gradient thread safety test (test/test_autograd.py)
- Added `test_grad_thread_safety` to verify thread-safe access to tensor gradients
- Uses `ThreadPoolExecutor` to concurrently access gradients and verify consistency
test_torchinductor.py
Add inner reduction detection test (test/inductor/test_torchinductor.py)
- Added `test_inner_reduction_detection` to verify reduction hint detection
- Checks for `ReductionHint.OUTER` in the generated code
test_custom_operators.cpp
Update tests for thread-safe operator registry changes (test/cpp/jit/test_custom_operators.cpp)
- Changed `auto&` references to `auto` for operator retrieval calls, since the registry now returns copies instead of references
cuda_cublas_handle_pool_test.cpp
Add concurrent access test for cuBLAS handle pool (aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp)
- New test exercising concurrent handle management and workspace clearing via `getCurrentCUDABlasHandle()` and `getCUDABlasLtWorkspace()`
test_scalartype.cpp
Add test for quantized integer type detection (test/cpp/aoti_abi_check/test_scalartype.cpp)
- Added `TestScalarType::isQIntType` to verify quantized integer type detection across scalar types
test_custom_ops.cpp
Update custom operator test for thread-safe registry (test/custom_operator/test_custom_ops.cpp)
- Changed `auto&` to `auto` for operator retrieval from the registry
64 files
test_utils.py
Refactor tests for device-agnostic GPU support (test/test_utils.py)
- Added a `device_type` variable using `torch.accelerator.current_accelerator()` for cross-device support
- Replaced `HAS_CUDA` with a `TEST_GPU` flag checking both XPU and CUDA availability
- Uses `torch.get_device_module()` and `torch.accelerator` API calls instead of `torch.cuda` directly, and uses the `device_type` variable for GPU tests
test_2d_composability.py
Simplify backend selection and fix decorator ordering (test/distributed/_composable/test_composability/test_2d_composability.py)
- Removed the `curr_backend` variable derived from `dist.get_default_backend_for_device()`; the `backend` property now uses hardcoded backend strings based on the `TEST_XPU` flag
- Reordered decorators (`@with_comms` before `@skip_if_lt_x_gpu`) for the XCCL and NCCL backends
test_ddp_hooks.py
Migrate to MultiProcessTestCase with NCCL-specific setup (test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py)
- Changed the base class from `DistributedTestBase` to `MultiProcessTestCase`; added `setUp()` and `tearDown()` methods with process spawning and file cleanup
- Added a `_get_process_group_nccl()` method for NCCL process group initialization
- Replaced `@requires_accelerator_dist_backend()` decorators with `@requires_nccl()`
- Updated the `gpus_for_rank()` function to use `torch.cuda.device_count()` directly
test_c10d_object_collectives.py
Add device type instantiation for multi-device testing (test/distributed/test_c10d_object_collectives.py)
- Added `TEST_HPU` and `TEST_CUDA` flags; replaced `torch.accelerator` calls with explicit device module selection
- Added an `instantiate_device_type_tests()` call to generate device-specific test variants
- Added a `device` parameter and updated the `with_comms` decorator to pass device information to test methods
tiling_utils.py
Simplify memory coalescing analysis and remove broadcast detection (torch/_inductor/tiling_utils.py)
- Removed the `find_broadcast_var()` function that identified broadcast patterns in memory access
- Removed the `try_get_buf_size()` helper function for buffer size retrieval and the `uncoalesced_addrs` field from the `CoalesceVarAnalysis` dataclass
- Simplified the `get_score()` function signature by removing the `buf_names` parameter, along with the related constraints and broadcast variable handling
test_pp_composability.py
Update backend requirements and GPU availability checks (test/distributed/_composable/test_composability/test_pp_composability.py)
- Updated `@requires_accelerator_dist_backend()` decorators to specify the backend list `["nccl", "xccl"]`
- Replaced `at_least_x_gpu()` checks with `TEST_MULTIGPU` and `TEST_XPU` flags, importing `TEST_MULTIGPU` and `TEST_XPU` from the common test utilities
test_scaled_matmul_cuda.py
Add MXFP4 SM120+ device skip conditions (test/test_scaled_matmul_cuda.py)
- Added `SM120OrLater` to the imports from common CUDA utilities; skips MXFP4 cases on SM120+ devices (MXFP4 is only supported on B200/B300)
- Updated `test_mxfp8_nvfp4_scaled_grouped_mm_2d_2d()`, `test_mxfp8_scaled_grouped_mm_2d_3d()`, and `test_blockwise_mxfp8_nvfp4_mxfp4_numerics()`
triton_heuristics.py
Remove fbcode and ROCm-specific logic from triton heuristics (torch/_inductor/runtime/triton_heuristics.py)
- Removed the `is_fbcode` import from `torch._environment` and the `is_fbcode()` checks for heuristic tuning
- Removed the `hsaco` vs `cubin` distinction; always uses the `.cubin` extension
check_perf_csv.py
Enhance performance CSV checking with detailed metrics (benchmarks/dynamo/check_perf_csv.py)
- Reports additional metrics (latency, compilation time, memory ratio) and the % deviation from target
static_cuda_launcher.py
Remove ROCm support from static CUDA launcher (torch/_inductor/runtime/static_cuda_launcher.py)
- Removed the `hsaco` binary format (only the `cubin` format remains), removed the `is_rocm` flag and associated conditional logic, and simplified the kernel ABI
comm_analysis.py
Simplify NCCL estimator error handling and backend checks (torch/_inductor/comm_analysis.py)
- Simplified error handling in the `_nccl_estimate()` function; checks `torch.distributed.is_nccl_available()` before using the estimator
cpp.py
Improve welford reduction helper handling in C++ codegen (torch/_inductor/codegen/cpp.py)
- Updated `reduction_combine()` to pass the helper value to `welford_combine()` when available
- Updated `need_use_acc_helper()` to always use the helper for `welford_reduce` (removed the scalar check)
- Added `welford_helper_cse` for welford reductions, generated alongside sum reductions
test_dataloader.py
Simplify dataloader worker affinity testing (test/test_dataloader.py)
- Updated `test_ind_worker_queue()` to use fixed batch sizes and worker counts instead of a dynamic calculation
- Updated `SetAffinityDataset` to accept and store the expected affinity value
- Added a `_worker_set_affinity_init()` function for worker initialization instead of a worker function
profiler.py
Add Python 3.2 compatibility and improve type annotations (torch/autograd/profiler.py)
- Added a fallback `ContextDecorator` for Python < 3.2 compatibility
- Uses `Optional[]` instead of the `|` union syntax for compatibility
- Changed the `record_function` base class to use `_ContextDecorator` with a pyrefly ignore comment
test_zero_redundancy_optimizer.py
Refactor device type detection and determinism handling (test/distributed/optim/test_zero_redundancy_optimizer.py)
- Removed the `contextmanager` import from contextlib and replaced the `deterministic_algorithms` context manager with direct `torch.use_deterministic_algorithms` calls
- Imports `get_devtype` from `torch.testing._internal.common_fsdp` and uses `get_devtype()` instead of custom logic
test_binary_ufuncs.py
Add torch.complex32 support to binary ufuncs (test/test_binary_ufuncs.py)
- Added `torch.complex32` support to `logaddexp` and `logaddexp2` operations, includes `torch.complex32` in CUDA tests, and handles `torch.complex32` in the test helper functions
common_methods_invocations.py
Update logaddexp dtype configuration and test skips (torch/testing/_internal/common_methods_invocations.py)
- Updated `logaddexp` dtype support to include `torch.complex32` for CUDA; added `test_python_ref_executor` to expected failures for complex types
test_c10d_functional_native.py
Refactor distributed test to use MultiProcessTestCase (test/distributed/test_c10d_functional_native.py)
- Changed the base class from `DistributedTestBase` to `MultiProcessTestCase` with the backend list `["nccl", "xccl"]`; added a `setUp` method to spawn processes and replaced the `create_pg` call with manual process group initialization using `FileStore`
simd.py
Simplify SIMD tiling score calculation (torch/_inductor/codegen/simd.py)
- Removed the `total_uncoalesced` calculation and related penalty scoring logic
- Updated the `score_mod` function to only consider tile size penalties
ops.py
Add data pointer accessor functions (test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/ops.py)
- Added a `get_any_data_ptr` function to return the tensor data pointer value and a `get_template_any_data_ptr` function for template-based data pointer retrieval with dtype checking
test_cpu_repro.py
Add simdlen parametrization to CPU test (test/inductor/test_cpu_repro.py)
- Parametrized over `simdlen` values `[None, 0]` and `dynamic` values `[True, False]`, using `config.patch` to set the `cpp.simdlen` configuration
common_dtensor.py
Simplify distributed tensor backend detection (torch/testing/_internal/distributed/_tensor/common_dtensor.py)
- Removed `ACCELERATOR_DIST_BACKENDS`; checks for `"nccl"` in the backend string instead
test_device_mesh.py
Remove HPU skip condition from device mesh test (test/distributed/test_device_mesh.py)
- Removed `TEST_HPU` from the skip condition in the test decorator
post_grad.py
Add all-reduce bucketing pass configuration (torch/_inductor/fx_passes/post_grad.py)
- Runs the all-reduce bucketing pass when `config.bucket_all_reduces_fx` is enabled
serialize.py
Add weights_only safety check to deserialization (torch/_export/serde/serialize.py)
- Updated `deserialize_torch_artifact` to first attempt loading with `weights_only=True`, falling back to `weights_only=False` on exception with a warning log
checkpoint.py
Implement lazy device type detection for checkpoint (torch/utils/checkpoint.py)
- Changed the `_default_device_type` initialization from hardcoded `"cuda"` to `None`; `get_device_type` now detects the device type on first call
test_sparse.py
Enable sparse mm test on MPS device (test/test_sparse.py)
- Removed the `@onlyCPU` decorator from the `test_mm` method and added a `@dtypesIfMPS` decorator with float32 and complex64 support
test_opaque_obj_v2.py
Replace deprecated _check_is_size usage (test/test_opaque_obj_v2.py)
- Replaced the `torch._check_is_size(u0)` call with `torch._check(u0 >= 0)`
python_variable.cpp
Refactor tensor Python object wrapping and lifecycle (torch/csrc/autograd/python_variable.cpp)
- Added a `using torch::utils::PyObjectPreservation` declaration and updated `THPVariable_Wrap` to use the new `THPVariable_WrapWithType` template function
- Uses the `PyObjectPreservation` utility; changed `THPVariable_traverse` and `THPVariable_clear` to simplified implementations
- Removed the `THPVariable_NewWithVar` function in favor of the template-based approach
Storage.cpp
Refactor storage Python object lifecycle management (torch/csrc/Storage.cpp)
- Added a `using torch::utils::PyObjectPreservation` declaration; updated `THPStorage_NewWithStorage` to use `PyObjectPreservation::init_fresh_nonatomic` and `THPStorage_Wrap` to use the new preservation utility
- Removed the `THPStorageMetaType` metaclass definition; `THPStorageType` now uses the standard `PyType_Type` as its metaclass
Blas.cpp
Refactor CUDA BLAS bias handling and dtype checks (aten/src/ATen/native/cuda/Blas.cpp)
- Updated `launchGemmAndBiasCublasLt` to accept `std::optional` for the bias parameter
- Updated `addmm_out_cuda_impl` to compute `use_bias_ptr_lt` earlier and pass the optional bias
- Removed the `is_bmm` parameter from the `baddbmm_bmm_out_dtype_checks` function; updated `_baddbmm_dtype_cuda` to properly initialize the output tensor and copy `self`
- Updated `_addmm_dtype_out_cuda`
XPUCachingAllocator.cpp
Add XPU graph memory pool management (c10/xpu/XPUCachingAllocator.cpp)
- In the `XPUAllocator` class: added a `PrivatePool` struct to manage memory pools for XPU graphs, a `MempoolIdHash` hash function for mempool IDs, an owner-`PrivatePool` field in `BlockPool`, and the `graph_pools` and `graph_pools_freeable` maps
- Updated `get_pool` to support graph-specific memory pools and `release_cached_blocks` to handle graph-specific pool cleanup; added `create_or_incref_pool` and `get_private_pool` methods; updated `malloc` and `emptyCache` to support mempool IDs
ScaledBlas.cpp
Add MXFP4 GPU support validation (aten/src/ATen/native/cuda/ScaledBlas.cpp)
- Added a `_check_mxfp4_support` function to validate MXFP4 support on B200/B300 GPUs, called from the `_scaled_mxfp4_mxfp4` function
static_cuda_launcher.cpp
Remove ROCm support from static CUDA launcher (torch/csrc/inductor/static_cuda_launcher.cpp)
- Changed `USE_CUDA || USE_ROCM` to `USE_CUDA && !USE_ROCM` with an explanatory comment; removed `USE_ROCM` conditional code blocks and HIP-specific includes
model_package_loader.cpp
Replace c10::filesystem with custom cross-platform file utilities (torch/csrc/inductor/aoti_package/model_package_loader.cpp)
- Removed `c10::filesystem` usage and replaced it with custom implementations
- Added `file_exists()`, `recursive_mkdir()`, and `recursive_rmdir()` helper functions
- Uses `access` and `F_OK` instead of `c10::filesystem`
PyInterpreter.cpp
Simplify PyObject reference management in PyInterpreter (torch/csrc/PyInterpreter.cpp)
- Simplified the `decref()` signature and implementation by removing the `has_pyobj_slot` parameter; added `try_incref()` and `refcnt()` to the PyInterpreter interface
- Updated `set_tensor_attr_with_capsule()` and `get_set_cached_attr()` to use simplified PyObject access
operator.cpp
Improve thread safety of operator registry access (torch/csrc/jit/runtime/operator.cpp)
- Added a `getOperatorsWithLockHeld()` private method for lock-protected operator retrieval
- Changed the `getOperators()` return type from reference to value (copy) for thread safety
- Added a `getSortedOperators()` method to centralize operator sorting logic; `getAllSortedOperatorsFor()` now delegates to `getSortedOperators()`
pyobject_preservation.cpp
Refactor PyObject preservation with atomic initialization (torch/csrc/utils/pyobject_preservation.cpp)
- Replaced the `clear_slots()` implementation with the new `PyObjectPreservation` class
- Added an `init_fresh_nonatomic()` method for initializing the PyObject on fresh targets
- Added an `init_once()` method with atomic compare-exchange for thread-safe initialization
Module.cpp
Simplify tensor PyObject management and remove MaybeOwned wrapper (torch/csrc/Module.cpp)
- Changed `THPVariable.cdata` from `c10::MaybeOwned` to `at::Tensor`; updated `THPModule_swap_tensor_impl()` to use local tensor copies instead of complex PyObject slot manipulation
- Uses `store_pyobj()` instead of `init_pyobj()`; added `!defined(USE_ROCM)` to the StaticCudaLauncher initialization
kernel.cpp
Add data pointer accessor functions to test extension (test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/kernel.cpp)
- Added a `get_any_data_ptr()` function to retrieve tensor data pointers and a `get_template_any_data_ptr()` templated function with scalar type dispatch
- Registered via `STABLE_TORCH_LIBRARY_FRAGMENT` and `STABLE_TORCH_LIBRARY_IMPL`
StorageImpl.cpp
Add PyObject reference management methods to StorageImpl (c10/core/StorageImpl.cpp)
- Added an `incref_pyobject()` method with an acquire fence for proper memory ordering
- Added a `decref_pyobject()` method for PyObject reference management and a `try_incref_pyobject()` method with an interpreter availability check
TensorImpl.cpp
Add PyObject reference management methods to TensorImpl (c10/core/TensorImpl.cpp)
- Removed the `pyobj_slot_.maybe_destroy_pyobj()` call from `release_resources()`; added an `incref_pyobject()` method with an acquire fence for proper memory ordering
- Added a `decref_pyobject()` method for PyObject reference management and a `try_incref_pyobject()` method with an interpreter availability check
jit_log.cpp
Replace filesystem utilities with string manipulation (torch/csrc/jit/jit_log.cpp)
- Replaced `c10::filesystem::path` usage with custom string manipulation via `StripBasename()` and string operations
- Updated `is_enabled()` and `jit_log_prefix()` to use the new string utilities
init.cpp
Update JIT Python bindings for thread-safe operator access (torch/csrc/jit/python/init.cpp)
- Changed `const auto&` to `auto` for operator retrieval in three locations, since the registry now returns copies instead of references
Logging.cpp
Remove filesystem dependency from logging utilities (c10/util/Logging.cpp)
- Removed the `#include` dependency; replaced `c10::filesystem::path(file).filename()` with the `StripBasename()` utility
inline_container.cc
Allow multiple serialization of triton binary files (caffe2/serialize/inline_container.cc)
- Allows triton binaries (`.so`, `.cubin`, `.hsaco`) to be serialized multiple times
PyInterpreter.cpp
Update noop PyInterpreter for simplified interface (c10/core/impl/PyInterpreter.cpp)
- Updated the `NoopPyInterpreterVTable::decref()` signature to remove the `has_pyobj_slot` parameter; added a `try_incref()` method returning false and a `refcnt()` method with a panic assertion
jit_opt_limit.cpp
Replace filesystem utilities in JIT optimization limit (torch/csrc/jit/jit_opt_limit.cpp)
- Replaced `c10::filesystem::path` with custom string utilities, updating the `#include`s and using `StripBasename()` and `ExcludeFileExtension()` for path manipulation
shim_common.cpp
Add data pointer accessor functions to AOTI shim (torch/csrc/shim_common.cpp)
- Added a `torch_get_const_data_ptr()` function to retrieve const tensor data pointers
- Added a `torch_get_mutable_data_ptr()` function to retrieve mutable tensor data pointers
Exception.cpp
Remove filesystem dependency from exception utilities (c10/util/Exception.cpp)
- Removed the `#include` dependency; replaced `c10::filesystem::path(file).filename()` with `detail::StripBasename()`
StorageMethods.cpp
Simplify Storage cdata assignment (torch/csrc/StorageMethods.cpp)
- Simplified `THPStorage__setCdata()` by removing the explicit destructor call; changed the `cdata` assignment from `MaybeOwned::owned()` to a direct `c10::Storage` assignment
input_buffer.cpp
Use new tensor stealability check in accumulation logic (torch/csrc/autograd/input_buffer.cpp)
- Replaced `at::caching::adjusted_use_count(v) == 1` with an `impl::is_tensor_stealable()` call
schema_matching.cpp
Update schema matching for thread-safe operator access (torch/csrc/jit/frontend/schema_matching.cpp)
- Changed `const auto&` to `auto` for operator variant retrieval
alias_analysis.cpp
Update alias analysis for thread-safe operator access (torch/csrc/jit/ir/alias_analysis.cpp)
- Changed `const auto&` to `auto` for operator candidate retrieval
symbolic_shape_registry.cpp
Update symbolic shape registry for thread-safe operator access (torch/csrc/jit/runtime/symbolic_shape_registry.cpp)
- Changed `auto&` to `auto` for inplace variant operator retrieval
ir.cpp
Update IR node schema matching for thread-safe access (torch/csrc/jit/ir/ir.cpp)
- Changed `const auto&` to `auto` for operator candidate retrieval
LogAddExpKernel.cu
Add complex number support to logaddexp CUDA kernel (aten/src/ATen/native/cuda/LogAddExpKernel.cu)
- Added complex-number handling to the kernel implementations and operations
Remove ROCm-specific atomic add implementations (aten/src/ATen/native/cuda/KernelUtils.cuh)
- Removed the ROCm-specific `__hip_bfloat162` and `__half2` `unsafeAtomicAdd` implementations
intrusive_ptr.h
Implement PyObject preservation in intrusive_ptr (c10/util/intrusive_ptr.h)
- Added a `kHasPyObject` constant to track PyObject wrapper presence in the refcount
- Added a `has_pyobject()` helper function to check the PyObject bit and a `TargetTraits` template for PyObject support configuration; updated `retain_()` and `reset_()` to manage the PyObject lifecycle on refcount transitions
- Added an `is_uniquely_owned()` method for a stronger uniqueness check; updated `weak_intrusive_ptr::lock()` with PyObject incref logic and the `incref()` function with PyObject management
PyObjectSlot.h
Simplify PyObjectSlot interface and add atomic accessors (c10/core/impl/PyObjectSlot.h)
- Removed several methods; added `load_pyobj()` and `store_pyobj()` atomic accessor methods and a `has_unique_reference()` method to check the PyObject refcount; changed `pyobj_` to an atomic for thread-safe access
cpp_prefix.h
Improve scalar type extraction for Welford helper (torch/csrc/inductor/cpp_prefix.h)
- Added a `GetScalarType` template to extract the scalar type from vectorized types
- Updated `WelfordHelper::weight_recps` to use `GetScalarType` for proper type extraction
- Added an `if constexpr (IsVecType::value)` conditional in `welford_combine()`
tensor_inl.h
Add typed data pointer accessors to stable Tensor API (torch/csrc/stable/tensor_inl.h)
- Added `mutable_data_ptr()` and `const_data_ptr()` methods with scalar type checking, guarded by `TORCH_FEATURE_VERSION >= TORCH_VERSION_2_10_0`
PyInterpreter.h
Update PyInterpreter interface for simplified PyObject management (c10/core/impl/PyInterpreter.h)
- Updated the `decref()` signature to remove the `has_pyobj_slot` parameter; added a `try_incref()` method taking a `PyObjectSlot` reference and a `refcnt()` method to retrieve the PyObject reference count
TensorImpl.h
Add PyObject lifecycle management to TensorImpl (c10/core/TensorImpl.h)
- Added `incref_pyobject()`, `decref_pyobject()`, and `try_incref_pyobject()` override methods
- Added a `TargetTraits` specialization for TensorImpl to enable PyObject support
11 files
test_state_dict_utils.py
Revert to CUDA-specific device implementations (test/distributed/checkpoint/test_state_dict_utils.py)
- Changed `torch.accelerator.device_count()` back to `torch.cuda.device_count()`; replaced `self.device_type` references with hardcoded `"cuda"` strings and uses the `is_cuda` property instead of `device.type` comparisons
- Replaced `torch.accelerator.synchronize()` with `torch.cuda.synchronize()`
test_static_cuda_launcher.py
Remove ROCm support and use CUDA-only cubin format (test/inductor/test_static_cuda_launcher.py)
- Removed the `hsaco` vs `cubin` handling and uses the `cubin` format for all kernels; added the `@skipIfRocm` decorator to all test methods
test_torch.py
Fix weak reference and storage lifecycle tests (test/test_torch.py)
- Updated `test_storage_use_count()` to expect 2 references instead of 1 (accounting for the wrapper)
- Changed the expected exception in `test_as_subclass()` from `RuntimeError` to `TypeError`; updated `test_tensor_dead_weak_ref()` to verify the tensor stays alive via a weak reference
- Updated `test_storage_dead_weak_ref()` to verify the storage lifecycle with weak references
- Added a `test_storage_thread_safety()` method for concurrent storage access validation
common.py
Add weights_only parameter to torch.load call (benchmarks/dynamo/common.py)
- Added the `weights_only=False` parameter to the `torch.load()` call when loading saved model outputs, since these outputs are not compatible with weights-only loading
test_fxir_backend.py
Fix fx_wrapper argument handling and add reshape tests (test/inductor/test_fxir_backend.py)
- Updated `fx_wrapper` mode to flatten arguments before passing them to the compiled module
- Added `test_reshape_dynamic_ph` and `test_reshape_dynamic_tmd` for dynamic reshape operations
cudagraph_trees.py
Fix cudagraph reference counting logic (torch/_inductor/cudagraph_trees.py)
- Updated the `expired` property to account for the extra reference when `extra_ref_check` is set
- Updated `check_refcount` to handle cached tensor outputs with multiple references
compile_fx.py
Fix fx_wrapper input handling for symbolic scalars (torch/_inductor/compile_fx.py)
- Handles symbolic scalar inputs in `fx_wrapper` mode (passing `None` when not using `fx_wrapper` mode) and adds validation
test_codecache.py
Simplify and fix codecache test skip conditions (test/inductor/test_codecache.py)
ir.py
Fix outer reduction detection for zero strides (torch/_inductor/ir.py)
- Fixed the inner/outer reduction decision when the reduction ranges contain 1
_op_schema.py
Fix typo in OpStrategy string representation (torch/distributed/tensor/_op_schema.py)
- Fixed the `__str__` method output from `OpStragety` to `OpStrategy`
CublasHandlePool.cpp
Add thread-safe mutex protection to cuBLAS workspace management (aten/src/ATen/cuda/CublasHandlePool.cpp)
- Added a `WorkspaceMapWithMutex` struct to wrap the workspace map with `std::shared_mutex` for thread safety
- Added a `setWorkspaceForHandle()` function with a double-checked locking pattern
- Updated `clearCublasWorkspaces()` and `getCUDABlasLtWorkspace()` to use mutex-protected access
2 files
__init__.py
Add deprecation annotation to _check_is_size (torch/__init__.py)
- Imported `deprecated` from `typing_extensions` and annotated the `_check_is_size` function with a removal notice
symbolic_shapes.py
Add deprecation annotation to guard_size_oblivious (torch/fx/experimental/symbolic_shapes.py)
- Annotated the `guard_size_oblivious` function, pointing to alternatives
1 file
config.py
Add all-reduce bucketing configuration options (torch/_inductor/config.py)
- Added a `bucket_all_reduces_fx` configuration option with values `"none"` or `"all"` and a `bucket_all_reduces_fx_bucket_size_determinator` optional callable configuration
1 file
kernel.cpp
Fix kernel definition indentation (test/cpp_extensions/libtorch_agnostic_2_9_extension/libtorch_agnostic_2_9/csrc/kernel.cpp)
- Fixed the indentation of the `m.def("test_default_constructor(bool undefined) -> bool")` line
34 files