diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index cd62eb5c5b752..93b2882ef475d 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -e3fb8c09b1ae675832424d7bf5f3484f697efd39 +68161e98aaeaeca02166063d19de92e81ea00c3b diff --git a/.github/scripts/tryrebase.py b/.github/scripts/tryrebase.py index 21d43a24bd275..264c3737914ec 100755 --- a/.github/scripts/tryrebase.py +++ b/.github/scripts/tryrebase.py @@ -85,11 +85,13 @@ def rebase_ghstack_onto( ) -> None: if ( subprocess.run( - [sys.executable, "-m", "ghstack", "--help"], capture_output=True + [sys.executable, "-m", "ghstack", "--help"], + capture_output=True, + check=False, ).returncode != 0 ): - subprocess.run([sys.executable, "-m", "pip", "install", "ghstack"]) + subprocess.run([sys.executable, "-m", "pip", "install", "ghstack"], check=True) orig_ref = f"{re.sub(r'/head$', '/orig', pr.head_ref())}" repo.fetch(orig_ref, orig_ref) @@ -116,7 +118,7 @@ def rebase_ghstack_onto( if dry_run: print("Don't know how to dry-run ghstack") else: - ghstack_result = subprocess.run(["ghstack"], capture_output=True) + ghstack_result = subprocess.run(["ghstack"], capture_output=True, check=True) push_result = ghstack_result.stdout.decode("utf-8") print(push_result) if ghstack_result.returncode != 0: diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index fd5989b631462..ef263c5a3d656 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -30,6 +30,8 @@ jobs: run: | tag_or_branch="${PT_GITHUB_REF#refs/tags/}" tag_or_branch="${tag_or_branch#refs/heads/}" + # replace directory separators with _ in branch name + tag_or_branch="${tag_or_branch//\//_}" echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV" echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV" - name: Create source distribution diff --git a/aten/src/ATen/Utils.cpp b/aten/src/ATen/Utils.cpp index a0fbc499378e2..0ed5f60161ab6 100644 --- a/aten/src/ATen/Utils.cpp +++ b/aten/src/ATen/Utils.cpp @@ -1,13 +1,10 @@ #include -#include #include #include #include #include -// NOLINTNEXTLINE(modernize-deprecated-headers) -#include #include #include #include diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index e0f803871114a..569a820042c0e 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -1,6 +1,5 @@ #include -#include #include #include #include diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index d8438bb4ce1b1..957b89c7a1f16 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -57,6 +57,7 @@ struct FormatGuard { out.copyfmt(saved); } private: + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) std::ostream & out; std::ios saved; }; @@ -65,7 +66,7 @@ std::ostream& operator<<(std::ostream & out, const DeprecatedTypeProperties& t) return out << t.toString(); } -static std::tuple __printFormat(std::ostream& stream, const Tensor& self) { +static std::tuple __printFormat(std::ostream& stream, const Tensor& self) { auto size = self.numel(); if(size == 0) { return std::make_tuple(1., 0); @@ -116,13 +117,13 @@ static std::tuple __printFormat(std::ostream& stream, const Ten } } double scale = 1; - int64_t sz = 11; + int sz = 11; if(intMode) { if(expMax > 9) { sz = 11; stream << std::scientific << std::setprecision(4); } else { - sz = expMax + 1; + sz = static_cast(expMax) + 1; 
stream << defaultfloat; } } else { @@ -141,7 +142,7 @@ static std::tuple __printFormat(std::ostream& stream, const Ten if(expMax == 0) { sz = 7; } else { - sz = expMax+6; + sz = static_cast(expMax) + 6; } stream << std::fixed << std::setprecision(4); } @@ -163,9 +164,7 @@ static void printScale(std::ostream & stream, double scale) { } static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t linesize, int64_t indent) { - double scale = 0.0; - int64_t sz = 0; - std::tie(scale, sz) = __printFormat(stream, self); + auto [scale, sz] = __printFormat(stream, self); __printIndent(stream, indent); int64_t nColumnPerLine = (linesize-indent)/(sz+1); @@ -284,9 +283,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi stream << "[ " << tensor_.toString() << "{}"; } else if(tensor.ndimension() == 1) { if (tensor.numel() > 0) { - double scale = 0.0; - int64_t sz = 0; - std::tie(scale, sz) = __printFormat(stream, tensor); + auto [scale, sz] = __printFormat(stream, tensor); if(scale != 1) { printScale(stream, scale); } diff --git a/aten/src/ATen/core/List_test.cpp b/aten/src/ATen/core/List_test.cpp index 825aca6a04556..cf15c44b7f3a5 100644 --- a/aten/src/ATen/core/List_test.cpp +++ b/aten/src/ATen/core/List_test.cpp @@ -3,6 +3,7 @@ using namespace c10; +// NOLINTBEGIN(performance-move-const-arg) TEST(ListTestIValueBasedList, givenEmptyList_whenCallingEmpty_thenReturnsTrue) { List list; EXPECT_TRUE(list.empty()); @@ -1159,3 +1160,4 @@ TEST(ListTest, toTypedList) { genericList = impl::toList(std::move(stringList)); EXPECT_THROW(c10::impl::toTypedList(std::move(genericList)), c10::Error); } +// NOLINTEND(performance-move-const-arg) diff --git a/aten/src/ATen/core/NamedTensor.cpp b/aten/src/ATen/core/NamedTensor.cpp index 40fd58a73ab18..dcf55dfa273cd 100644 --- a/aten/src/ATen/core/NamedTensor.cpp +++ b/aten/src/ATen/core/NamedTensor.cpp @@ -2,7 +2,6 @@ #include #include -#include namespace at { diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 7edecf5d9aa82..14b15b00e77b3 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -1210,7 +1210,7 @@ TORCH_API intrusive_ptr collectAny( TypePtr typePtr, std::vector devices) : srcFutures(srcs), - dstFuture(make_intrusive(typePtr, std::move(devices))) {} + dstFuture(make_intrusive(std::move(typePtr), std::move(devices))) {} std::atomic done{false}; List> srcFutures; intrusive_ptr dstFuture; diff --git a/aten/src/ATen/core/library.cpp b/aten/src/ATen/core/library.cpp index 92cb179631af1..fef96198717e2 100644 --- a/aten/src/ATen/core/library.cpp +++ b/aten/src/ATen/core/library.cpp @@ -44,7 +44,7 @@ namespace { CppFunction::CppFunction(c10::KernelFunction func, c10::optional cpp_signature, std::unique_ptr schema) : func_(std::move(func)) - , cpp_signature_(std::move(cpp_signature)) + , cpp_signature_(cpp_signature) , schema_(std::move(schema)) , debug_() {} @@ -172,7 +172,7 @@ Library& Library::_def(std::variant&& na std::move(name), dispatch_key, std::move(f.func_), - std::move(f.cpp_signature_), + f.cpp_signature_, std::move(f.schema_), debugString(std::move(f.debug_), file_, line_) ) @@ -223,7 +223,7 @@ Library& Library::_impl(const char* name_str, CppFunction&& f, _RegisterOrVerify std::move(name), dispatch_key, std::move(f.func_), - std::move(f.cpp_signature_), + f.cpp_signature_, std::move(f.schema_), debugString(std::move(f.debug_), file_, line_) ) diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 
368311abd408b..3ea84cc2b6752 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -5,9 +5,13 @@ #include #include +#include +#include + namespace at::cuda { static bool _cuda_graphs_debug = false; +constexpr int kSynchronizeBusyWaitMillis = 10; MempoolId_t graph_pool_handle() { #if !defined(USE_ROCM) || ROCM_VERSION >= 50300 @@ -55,6 +59,25 @@ CaptureId_t capture_sequence_id() { * describes memory management for captures. */ +std::atomic CUDAGraph::pending_event_queries = 0; + +// Track any outstanding event queries that could happen e.g., in a NCCL watchdog so that they +// can be resolved before the capture begins. Note that event queries are not allowed during a +// graph capture in the default capture mode. +void CUDAGraph::inc_pending_event_queries() { + pending_event_queries++; +} + +void CUDAGraph::dec_pending_event_queries() { + TORCH_INTERNAL_ASSERT(pending_event_queries > 0, + "Attempted to decrement the number of outstanding events to be queried, but it was <= 0."); + pending_event_queries--; +} + +int CUDAGraph::num_pending_event_queries() { + return pending_event_queries; +} + CUDAGraph::CUDAGraph() // CUDAStreams may not be default-constructed. : capture_stream_(at::cuda::getCurrentCUDAStream()) { @@ -115,6 +138,15 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt // due to the capture status being updated _after_ a capture had already started. c10::cuda::CUDACachingAllocator::beginAllocateStreamToPool(capture_dev_, capture_stream_, mempool_id_); + // At this point, any NCCL watchdogs should be aware that we are in capture mode + // and therefore should not enqueue any additional work that could be event-queried. + // We still must wait on any existing work that has not been cleaned up. + while (num_pending_event_queries()) { + TORCH_WARN_ONCE("Waiting for pending NCCL work to finish before starting graph capture."); + std::this_thread::sleep_for( + std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); + } + // cudaStreamCaptureModeGlobal is the most conservative option to // prevent potentially unsafe CUDA API calls during capture. 
See // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85 diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index 00113180e3fa0..804067560a6ea 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -5,6 +5,8 @@ #include #include +#include + namespace at { struct CUDAGeneratorImpl; @@ -19,6 +21,9 @@ struct TORCH_CUDA_CPP_API CUDAGraph { CUDAGraph(); ~CUDAGraph(); + static void inc_pending_event_queries(); + static void dec_pending_event_queries(); + static int num_pending_event_queries(); void capture_begin(MempoolId_t pool={0, 0}, cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal); void capture_end(); void replay(); @@ -33,6 +38,8 @@ struct TORCH_CUDA_CPP_API CUDAGraph { cudaGraphExec_t graph_exec_ = NULL; #endif + static std::atomic pending_event_queries; + // internal states so reset() can do its best cleaning up // Set to true in capture_end if cudaStreamEndCapture succeeded // Set back to false soon after, when graph_ is consumed by cudaGraphInstantiate diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 36531b6412771..22dbb661f18b4 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -397,7 +398,7 @@ class CUDAHostAllocator { (void*)devptr == (void*)ptr, "Host and device pointer dont match with cudaHostRegister. " "Please dont use this feature by setting " - "PYTORCH_PINNED_ALLOC_CONF=use_cuda_host_register:False (default)", + "PYTORCH_CUDA_ALLOC_CONF=use_cuda_host_register:False (default)", ""); } @@ -412,18 +413,31 @@ class CUDAHostAllocator { size_t numMapThreads = c10::cuda::CUDACachingAllocator:: CUDAAllocatorConfig::pinned_num_register_threads(); if ((numMapThreads > 1) && (roundSize >= (pageSize * numMapThreads))) { + // parallelize the mapping of pages with a threadpool auto* pool = getThreadPool(); + std::vector> promises; + std::vector> futures; + promises.reserve(numMapThreads); + futures.reserve(numMapThreads); + for (size_t i = 0; i < numMapThreads; i++) { - pool->run(std::bind( - &CUDAHostAllocator::mapPagesForRegister, - this, + promises.emplace_back(); + futures.push_back(promises[i].get_future()); + auto task = [this, i, ptr, roundSize, numMapThreads, pageSize, &promises]() mutable { + mapPagesForRegister( *ptr, roundSize, i, // thread task-id numMapThreads, - pageSize)); + pageSize); + // set the promise when mapping pages are done + promises[i].set_value(); + }; + pool->run(task); + } + for (auto& future : futures) { + future.wait(); } - pool->waitWorkComplete(); } else { // Map pages in the same thread mapPagesForRegister(*ptr, roundSize, 0, 1, pageSize); diff --git a/aten/src/ATen/functorch/BatchRulesConvolution.cpp b/aten/src/ATen/functorch/BatchRulesConvolution.cpp index 2db15c8028969..c25c4972da25d 100644 --- a/aten/src/ATen/functorch/BatchRulesConvolution.cpp +++ b/aten/src/ATen/functorch/BatchRulesConvolution.cpp @@ -17,7 +17,7 @@ namespace at { namespace functorch { // we do not support batch_group_count (which is needed for convolution backwards). // Instead, there's a convolution_backward op that needs a batching rule. 
static std::tuple> -convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tensor& rhs, optional rhs_bdim, const optional& bias, optional bias_bdim, IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, int64_t groups) { +convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tensor& rhs, optional rhs_bdim, const optional& bias, optional bias_bdim, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups) { DimVector lhs_spec(stride.size() + 2); std::iota(lhs_spec.begin(), lhs_spec.end(), 0); DimVector rhs_spec = lhs_spec; @@ -43,13 +43,13 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens if (lhs_bdim && !rhs_bdim) { auto new_x = reshape_dim_into(*lhs_bdim, lhs_spec[0], lhs); auto out = at::convolution_symint(new_x, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); - out = reshape_dim_outof(out_spec[0], lhs.sizes()[*lhs_bdim], out); + out = reshape_dim_outof_symint(out_spec[0], lhs.sizes()[*lhs_bdim], out); result = std::make_tuple(out, out_spec[0]); } else if (!lhs_bdim && rhs_bdim) { if (groups == 1) { auto new_w = reshape_dim_into(*rhs_bdim, rhs_spec[0], rhs); auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); - out = reshape_dim_outof(out_spec[1], rhs.size(*rhs_bdim), out); + out = reshape_dim_outof_symint(out_spec[1], rhs.size(*rhs_bdim), out); result = std::make_tuple(out, out_spec[1]); } else { if (transposed) { @@ -64,9 +64,9 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens // NIHW, I(BO)HW -> N(GBO)HW auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); // N(GBO)HW -> NG(BO)HW - out = reshape_dim_outof(1, groups, out); + out = reshape_dim_outof_symint(1, groups, out); // NG(BO)HW -> NGBOHW - out = reshape_dim_outof(2, rhs.size(*rhs_bdim), out); + out = reshape_dim_outof_symint(2, rhs.size(*rhs_bdim), out); // NGBOHW -> NB(GO)HW out = reshape_dim_into(1, 2, out); result = std::make_tuple(out, 1); @@ -78,7 +78,7 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens // (PyTorch convs have a fixed dimension order) // B(GO)IHW -> BGOIHW - auto new_w = reshape_dim_outof(0 + (*rhs_bdim == 0), groups, rhs); + auto new_w = reshape_dim_outof_symint(0 + (*rhs_bdim == 0), groups, rhs); // BGOIHW -> G(BO)IHW new_w = reshape_dim_into(*rhs_bdim + (*rhs_bdim > 0), 1, new_w); // G(BO)IHW -> (GBO)IHW @@ -86,9 +86,9 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens // N(GI)HW, (GBO)IHW -> N(GBO)HW auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); // N(GBO)HW -> NG(BO)HW - out = reshape_dim_outof(1, groups, out); + out = reshape_dim_outof_symint(1, groups, out); // NG(BO)HW -> NGBOHW - out = reshape_dim_outof(2, rhs.size(*rhs_bdim), out); + out = reshape_dim_outof_symint(2, rhs.size(*rhs_bdim), out); // NGBOHW -> NB(GO)HW out = reshape_dim_into(1, 2, out); result = std::make_tuple(out, 1); @@ -100,7 +100,7 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens auto dim_with_groups = transposed ? 
1 : 0; auto new_w = reshape_dim_into(*rhs_bdim, rhs_spec[dim_with_groups], rhs); auto out = at::convolution_symint(new_x, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); - out = reshape_dim_outof(out_spec[1], lhs.sizes()[*lhs_bdim], out); + out = reshape_dim_outof_symint(out_spec[1], lhs.sizes()[*lhs_bdim], out); result = std::make_tuple(out, out_spec[1]); } else { result = std::make_tuple(at::convolution_symint(lhs, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups), nullopt); @@ -244,8 +244,8 @@ convolution_backward_input_batch_rule( const Tensor& grad_output, optional grad_output_bdim, const Tensor& input, optional input_bdim, const Tensor& weight, optional weight_bdim, - IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, - c10::SymIntArrayRef output_padding, int64_t groups) { + c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, + c10::SymIntArrayRef output_padding, c10::SymInt groups) { const std::array mask = {true, false, false}; if (grad_output_bdim && weight_bdim) { // regular: BNO, BOI -> N(BO), (BO)I -> N(BI) @@ -296,7 +296,7 @@ convolution_backward_input_batch_rule( } else { // N(GO), B(GI)O -> N(GO), (GBI)O -> N(GBI) auto weight_ = moveBatchDimToFront(weight, weight_bdim); // B(GI)O - weight_ = reshape_dim_outof(1, groups, weight_); // BGIO + weight_ = reshape_dim_outof_symint(1, groups, weight_); // BGIO weight_ = weight_.transpose(0, 1); // GBIO weight_ = weight_.flatten(0, 2); // (GBI)O const auto dummy_input = make_dummy(input, input_bdim, 1, batch_size); @@ -306,8 +306,8 @@ convolution_backward_input_batch_rule( grad_input = std::get<0>(result); // N(GBI) } // N(GBI) -> NG(BI) -> NGBI -> NBGI -> NB(GI) - grad_input = reshape_dim_outof(1, groups, grad_input); - grad_input = reshape_dim_outof(2, batch_size, grad_input); + grad_input = reshape_dim_outof_symint(1, groups, grad_input); + grad_input = reshape_dim_outof_symint(2, batch_size, grad_input); grad_input = grad_input.transpose(1, 2); grad_input = reshape_dim_into(2, 2, grad_input); return std::make_tuple(grad_input, 1); @@ -325,8 +325,8 @@ convolution_backward_weight_batch_rule( const Tensor& grad_output, optional grad_output_bdim, const Tensor& input, optional input_bdim, const Tensor& weight, optional weight_bdim, - IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, - c10::SymIntArrayRef output_padding, int64_t groups) { + c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, + c10::SymIntArrayRef output_padding, c10::SymInt groups) { const std::array mask = {false, true, false}; if (grad_output_bdim && input_bdim) { // BNO, BNI -> N(BO), N(BI) -> (BO)I (regular) (BI)O (transposed) @@ -338,7 +338,7 @@ convolution_backward_weight_batch_rule( grad_output_, input_, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups * batch_size, mask); auto grad_weight = std::get<1>(result); - grad_weight = reshape_dim_outof(0, batch_size, grad_weight); + grad_weight = reshape_dim_outof_symint(0, batch_size, grad_weight); return std::make_tuple(grad_weight, 0); } else if (grad_output_bdim && !input_bdim) { const auto batch_size = grad_output.size(*grad_output_bdim); @@ -352,11 +352,11 @@ convolution_backward_weight_batch_rule( grad_output_, input, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = 
std::get<1>(result); - grad_weight = reshape_dim_outof(out_ch_dim, batch_size, grad_weight); + grad_weight = reshape_dim_outof_symint(out_ch_dim, batch_size, grad_weight); return std::make_tuple(grad_weight, out_ch_dim); } else { auto grad_output_ = moveBatchDimToFront(grad_output, grad_output_bdim); // BN(GO) - grad_output_ = reshape_dim_outof(2, groups, grad_output_); // BNGO + grad_output_ = reshape_dim_outof_symint(2, groups, grad_output_); // BNGO grad_output_ = grad_output_.movedim(0, 2); // NGBO grad_output_ = grad_output_.flatten(1, 3); // N(GBO) if (!transposed) { @@ -366,7 +366,7 @@ convolution_backward_weight_batch_rule( grad_output_, input, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); - grad_weight = grad_weight.unflatten(0, { groups, batch_size, -1 }); // GBOI + grad_weight = grad_weight.unflatten_symint(0, { groups, batch_size, -1 }); // GBOI grad_weight = grad_weight.transpose(0, 1); // BGOI grad_weight = grad_weight.flatten(1, 2); // B(GO)I return std::make_tuple(grad_weight, 0); @@ -377,7 +377,7 @@ convolution_backward_weight_batch_rule( grad_output_, input, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); - grad_weight = reshape_dim_outof(1, batch_size, grad_weight); + grad_weight = reshape_dim_outof_symint(1, batch_size, grad_weight); return std::make_tuple(grad_weight, 1); } } @@ -393,11 +393,11 @@ convolution_backward_weight_batch_rule( grad_output, input_, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); - grad_weight = reshape_dim_outof(in_ch_dim, batch_size, grad_weight); + grad_weight = reshape_dim_outof_symint(in_ch_dim, batch_size, grad_weight); return std::make_tuple(grad_weight, in_ch_dim); } else { auto input_ = moveBatchDimToFront(input, input_bdim); // BN(GI) - input_ = reshape_dim_outof(2, groups, input_); // BNGI + input_ = reshape_dim_outof_symint(2, groups, input_); // BNGI input_ = input_.movedim(0, 2); // NGBI input_ = input_.flatten(1, 3); // N(GBI) if (!transposed) { @@ -407,7 +407,7 @@ convolution_backward_weight_batch_rule( grad_output, input_, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); - grad_weight = reshape_dim_outof(1, batch_size, grad_weight); + grad_weight = reshape_dim_outof_symint(1, batch_size, grad_weight); return std::make_tuple(grad_weight, 1); } else { // transposed: N(GO), BN(GI) -> N(GO), N(GBI) -> (GBI)O @@ -416,7 +416,7 @@ convolution_backward_weight_batch_rule( grad_output, input_, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); - grad_weight = grad_weight.unflatten(0, { groups, batch_size, -1 }); // GBIO + grad_weight = grad_weight.unflatten_symint(0, { groups, batch_size, -1 }); // GBIO grad_weight = grad_weight.transpose(0, 1); // BGIO grad_weight = grad_weight.flatten(1, 2); // B(GI)O return std::make_tuple(grad_weight, 0); @@ -436,8 +436,8 @@ convolution_backward_weight_batch_rule( static std::tuple convolution_backward_plumbing( const Tensor& grad_output_, const Tensor& input_, const Tensor& weight_, const c10::OptionalArrayRef bias_sizes_opt, - IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, - c10::SymIntArrayRef output_padding, int64_t groups, std::array 
output_mask) { + c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, + c10::SymIntArrayRef output_padding, c10::SymInt groups, std::array output_mask) { const auto maybe_layer = maybeCurrentDynamicLayer(); vmap_check_escaped(maybe_layer, "convolution_backward_plumbing"); int64_t cur_level = maybe_layer->layerId(); diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp index 9d1a681bf2c1f..64cefc90e0d85 100644 --- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp +++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp @@ -314,10 +314,10 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { m.impl("conv1d", native::conv1d_symint); m.impl("conv2d", native::conv2d_symint); m.impl("conv3d", native::conv3d_symint); - OP_DECOMPOSE2(conv1d, padding); - OP_DECOMPOSE2(conv2d, padding); - OP_DECOMPOSE2(conv3d, padding); - OP_DECOMPOSE(_convolution_mode); + m.impl("conv1d.padding", native::conv1d_padding_symint); + m.impl("conv2d.padding", native::conv2d_padding_symint); + m.impl("conv3d.padding", native::conv3d_padding_symint); + m.impl("_convolution_mode", native::_convolution_mode_symint); OP_DECOMPOSE(type_as); OP_DECOMPOSE(linalg_diagonal); OP_DECOMPOSE(diagonal_copy); diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 5d4cb697979e8..5d2691b9761ee 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -118,8 +118,8 @@ enum class ConvBackend { // This overload is exposed to python for testing, etc. TORCH_API ConvBackend select_conv_backend( const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, IntArrayRef dilation, - bool transposed, SymIntArrayRef output_padding, int64_t groups, const at::OptionalSymIntArrayRef bias_sizes_opt); + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, + bool transposed, SymIntArrayRef output_padding, c10::SymInt groups, const at::OptionalSymIntArrayRef bias_sizes_opt); TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input, const Tensor& weight, @@ -211,7 +211,7 @@ static void convolution_shape_check( template static inline std::vector _conv_output_size( ArrayRef input_size, ArrayRef weight_size, - ArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef() + ArrayRef padding, ArrayRef stride, ArrayRef dilation = ArrayRef() ) { // ASSERT(input_size.size() > 2) // ASSERT(input_size.size() == weight_size.size()) @@ -237,7 +237,7 @@ static inline std::vector conv_output_size( static inline std::vector conv_output_size( SymIntArrayRef input_size, SymIntArrayRef weight_size, - SymIntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef() + SymIntArrayRef padding, SymIntArrayRef stride, SymIntArrayRef dilation = SymIntArrayRef() ) { return _conv_output_size(input_size, weight_size, padding, stride, dilation); } @@ -245,7 +245,7 @@ static inline std::vector conv_output_size( template std::vector _conv_input_size( ArrayRef output_size, ArrayRef weight_size, - ArrayRef padding, ArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups + ArrayRef padding, ArrayRef output_padding, ArrayRef stride, ArrayRef dilation, T groups ) { // ASSERT(output_size.size() > 2) // ASSERT(output_size.size() == weight_size.size()) @@ -263,7 +263,7 @@ std::vector _conv_input_size( static inline std::vector conv_input_size( 
SymIntArrayRef output_size, SymIntArrayRef weight_size, - SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups + SymIntArrayRef padding, SymIntArrayRef output_padding, SymIntArrayRef stride, SymIntArrayRef dilation, c10::SymInt groups ) { return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups); } diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index ef7c3bfd98f27..9c31026af54cf 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -93,7 +93,7 @@ static bool conv_benchmark_empty_cache = true; // Check workload to activate fast depthwise FP16 cudnn conv kernels template -bool check_cudnn_depthwise_workload(const at::Tensor& input, int stride) { +bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) { auto w = at::symint::size(input, 3); // same as h auto ch = at::symint::size(input, 1); auto bs = at::symint::size(input, 0); @@ -216,7 +216,7 @@ bool check_cudnn_depthwise_workload(const at::Tensor& input, int stride) { // simplified version for cudnn 8.2 and above template -bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, int stride, const at::Tensor& weight) { +bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, T stride, const at::Tensor& weight) { // 1D conv if(at::symint::size(input, 2) == 1 && stride == 1){ return true; @@ -269,9 +269,9 @@ static bool xnnpack_use_convolution2d( const Tensor& weight, const at::OptionalSymIntArrayRef bias_sizes_opt, const SymIntArrayRef padding, - const IntArrayRef stride, - const IntArrayRef dilation, - const int64_t groups, + const SymIntArrayRef stride, + const SymIntArrayRef dilation, + const c10::SymInt groups, const bool transposed) { // Never use xnnpack for symbolic tracing return false; @@ -284,12 +284,12 @@ static bool xnnpack_use_convolution2d( // int64_t template struct ConvParams { - std::vector stride; + std::vector stride; std::vector padding; - std::vector dilation; + std::vector dilation; bool transposed; std::vector output_padding; - int groups; + T groups; bool benchmark; bool deterministic; bool cudnn_enabled; @@ -644,7 +644,7 @@ static void check_shape_forward(const at::Tensor& input, const ConvParams& params) { int64_t k = input.ndimension(); int64_t weight_dim = weight_sizes.size(); - int64_t groups = params.groups; + auto groups = params.groups; const auto& padding = params.padding; const auto& dilation = params.dilation; bool transposed = params.transposed; @@ -831,12 +831,12 @@ at::Tensor complex_convolution( const Tensor& input, const Tensor& weight, const Tensor& bias, - IntArrayRef stride, + SymIntArrayRef stride, SymIntArrayRef padding, - IntArrayRef dilation, + SymIntArrayRef dilation, bool transposed, SymIntArrayRef output_padding, - int64_t groups) { + c10::SymInt groups) { check_input_same_type_as_parameters(input, weight, bias); Tensor i_r, i_i, w_r, w_i; std::tie(i_r, i_i) = complex_to_real(input.resolve_conj()); @@ -871,10 +871,10 @@ at::Tensor complex_convolution_mode( const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias_opt, - at::IntArrayRef stride, + c10::SymIntArrayRef stride, c10::string_view padding, - at::IntArrayRef dilation, - int64_t groups) { + c10::SymIntArrayRef dilation, + c10::SymInt groups) { auto bias = bias_opt.value_or(Tensor()); check_input_same_type_as_parameters(input, weight, bias); Tensor i_r, i_i, w_r, w_i; @@ -884,15 +884,15 
@@ at::Tensor complex_convolution_mode( // See [NOTE] Complex Convolution Tensor a, b, c; if (!bias.defined()) { - a = at::_convolution_mode(i_r, w_r, bias, stride, padding, dilation, groups); - b = at::_convolution_mode(i_i, w_i, bias, stride, padding, dilation, groups); - c = at::_convolution_mode(i_r + i_i, w_r + w_i, bias, stride, padding, dilation, groups); + a = at::_convolution_mode_symint(i_r, w_r, bias, stride, padding, dilation, groups); + b = at::_convolution_mode_symint(i_i, w_i, bias, stride, padding, dilation, groups); + c = at::_convolution_mode_symint(i_r + i_i, w_r + w_i, bias, stride, padding, dilation, groups); } else { Tensor b_r, b_i; std::tie(b_r, b_i) = complex_to_real(bias.resolve_conj()); - a = at::_convolution_mode(i_r, w_r, b_r, stride, padding, dilation, groups); - b = at::_convolution_mode(i_i, w_i, Tensor(), stride, padding, dilation, groups); - c = at::_convolution_mode(i_r + i_i, w_r + w_i, b_r + b_i, stride, padding, dilation, groups); + a = at::_convolution_mode_symint(i_r, w_r, b_r, stride, padding, dilation, groups); + b = at::_convolution_mode_symint(i_i, w_i, Tensor(), stride, padding, dilation, groups); + c = at::_convolution_mode_symint(i_r + i_i, w_r + w_i, b_r + b_i, stride, padding, dilation, groups); } auto i = c10::Scalar(c10::complex(0, 1)); @@ -903,7 +903,7 @@ at::Tensor complex_convolution_mode( at::Tensor conv1d_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, IntArrayRef dilation, int64_t groups) { + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -922,7 +922,7 @@ at::Tensor conv1d_symint( at::Tensor conv2d_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, IntArrayRef dilation, int64_t groups) { + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -949,7 +949,7 @@ at::Tensor conv2d_symint( at::Tensor conv3d_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, IntArrayRef dilation, int64_t groups) { + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -969,7 +969,7 @@ at::Tensor conv3d_symint( static Tensor convolution_same( const Tensor &input, const Tensor &weight, const Tensor &bias, - IntArrayRef stride, IntArrayRef dilation, int64_t groups) { + SymIntArrayRef stride, SymIntArrayRef dilation, c10::SymInt groups) { auto k = weight.dim(); TORCH_CHECK(k > 2, "weight should have at least three dimensions"); @@ -1031,10 +1031,10 @@ static Tensor convolution_same( dilation, false, output_padding, groups); } -Tensor _convolution_mode( +Tensor _convolution_mode_symint( const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, c10::string_view padding, IntArrayRef dilation, - int64_t groups) { + SymIntArrayRef stride, 
c10::string_view padding, SymIntArrayRef dilation, + c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -1043,18 +1043,16 @@ Tensor _convolution_mode( return at::native::convolution_same( input, weight, bias, stride, dilation, groups); } else if (padding == "valid") { - // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) - const int64_t padding_[] = {0}; - return at::convolution( - input, weight, bias, stride, padding_, dilation, false, padding_, groups); + return at::convolution_symint( + input, weight, bias, stride, {{0}}, dilation, false, {{0}}, groups); } TORCH_CHECK(false, "Invalid padding string: '", padding, "'"); } -at::Tensor conv1d( +at::Tensor conv1d_padding_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias, - IntArrayRef stride, c10::string_view padding, IntArrayRef dilation, - int64_t groups) { + c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, + c10::SymInt groups) { Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); @@ -1062,15 +1060,15 @@ at::Tensor conv1d( if (at::isComplexType(input_.scalar_type())) { output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); } else { - output = at::_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + output = at::_convolution_mode_symint(input, weight, bias, stride, std::move(padding), dilation, groups); } return is_batched ? std::move(output) : output.squeeze(0); } -at::Tensor conv2d( +at::Tensor conv2d_padding_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias, - IntArrayRef stride, c10::string_view padding, IntArrayRef dilation, - int64_t groups) { + c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, + c10::SymInt groups) { Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); @@ -1078,15 +1076,15 @@ at::Tensor conv2d( if (at::isComplexType(input_.scalar_type())) { output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); } else { - output = at::_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + output = at::_convolution_mode_symint(input, weight, bias, stride, std::move(padding), dilation, groups); } return is_batched ? std::move(output) : output.squeeze(0); } -at::Tensor conv3d( +at::Tensor conv3d_padding_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias, - IntArrayRef stride, c10::string_view padding, IntArrayRef dilation, - int64_t groups) { + c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, + c10::SymInt groups) { Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); @@ -1094,14 +1092,14 @@ at::Tensor conv3d( if (at::isComplexType(input_.scalar_type())) { output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); } else { - output = at::_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + output = at::_convolution_mode_symint(input, weight, bias, stride, std::move(padding), dilation, groups); } return is_batched ? 
std::move(output) : output.squeeze(0); } at::Tensor conv_transpose1d_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, int64_t groups, IntArrayRef dilation) { + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, c10::SymInt groups, SymIntArrayRef dilation) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -1122,7 +1120,7 @@ at::Tensor conv_transpose1d_symint( at::Tensor conv_transpose2d_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, int64_t groups, IntArrayRef dilation) { + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, c10::SymInt groups, SymIntArrayRef dilation) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -1143,7 +1141,7 @@ at::Tensor conv_transpose2d_symint( at::Tensor conv_transpose3d_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, int64_t groups, IntArrayRef dilation) { + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, c10::SymInt groups, SymIntArrayRef dilation) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -1302,8 +1300,8 @@ ConvBackend _select_conv_backend( // Selects a backend for convolution based on the inputs and params. ConvBackend select_conv_backend( const Tensor& input_r, const Tensor& weight_r, const c10::optional& bias_opt, - IntArrayRef stride_, SymIntArrayRef padding_, IntArrayRef dilation_, - bool transposed_, SymIntArrayRef output_padding_, int64_t groups_, const at::OptionalSymIntArrayRef bias_sizes_opt) { + SymIntArrayRef stride_, SymIntArrayRef padding_, SymIntArrayRef dilation_, + bool transposed_, SymIntArrayRef output_padding_, c10::SymInt groups_, const at::OptionalSymIntArrayRef bias_sizes_opt) { c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index bc3e3505185ad..cceb8baf9b063 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -234,7 +234,7 @@ index_select_add( offsets_data = offsets_include_last.data(); } #if defined(USE_FBGEMM) - constexpr bool isbf16 = std::is_same::value ? false : true; + constexpr bool isbf16 = std::is_same_v ? false : true; auto kernel_16bit_index_t = fbgemm_kernel_cache ? fbgemm_kernel_cache ->getCallback(ddim) @@ -245,7 +245,8 @@ index_select_add( /* prefetch */ 16, /* is_weight_positional */ false, /* use_offsets */ true, - /* isbf16*/ isbf16); + /* is_bf16_out */ isbf16, + /* is_bf16_in */ isbf16); at::parallel_for( 0, output_size, 1, [&](index_t start_idx, index_t end_idx) { bool success = kernel_16bit_index_t( @@ -607,7 +608,7 @@ index_select_scale_add( auto* scale_data_fp32 = scale_fp32.mutable_data_ptr(); #if defined(USE_FBGEMM) - constexpr bool isbf16 = std::is_same::value ? 
false : true; + constexpr bool isbf16 = std::is_same_v ? false : true; if constexpr (isbf16) { fbgemm::Bfloat16ToFloat_simd( reinterpret_cast(scale_data), @@ -629,7 +630,8 @@ index_select_scale_add( /* prefetch */ 16, /* is_weight_positional */ false, /* use_offsets */ true, - /* isbf16*/ isbf16); + /* is_bf16_out */ isbf16, + /* is_bf16_in */ isbf16); at::parallel_for( 0, output_size, 1, [&](index_t start_idx, index_t end_idx) { bool success = kernel_16bit_index_t( diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 4bb6c258988a3..33a733273a80a 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -76,7 +76,7 @@ static inline T pooling_output_shape( template std::pair _pooling_same_mode_padding_lr( - T inputSize, T kernelSize, int64_t stride, int64_t dilation) { + T inputSize, T kernelSize, T stride, T dilation) { // NOTE: with strides, the output shape is ceil(inputSize/stride) auto total_padding = T(dilation) * (kernelSize - 1); @@ -99,8 +99,8 @@ inline std::pair pooling_same_mode_padding_lr( } inline std::pair pooling_same_mode_padding_lr( - c10::SymInt inputSize, c10::SymInt kernelSize, int64_t stride, int64_t dilation) { - return _pooling_same_mode_padding_lr(std::move(inputSize), std::move(kernelSize), stride, dilation); + c10::SymInt inputSize, c10::SymInt kernelSize, c10::SymInt stride, c10::SymInt dilation) { + return _pooling_same_mode_padding_lr(std::move(inputSize), std::move(kernelSize), std::move(stride), std::move(dilation)); } // AveragePool2d/DilatedMaxPool2d (forward) diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index d84e7adc27a14..adf23e53fa0d7 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -2096,7 +2096,27 @@ bool cpu_equal(const Tensor& self, const Tensor& other) { && self.layout() == other.layout() && self.is_neg() == other.is_neg() && self.is_conj() == other.is_conj()) { - return true; + if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true)) { + return true; + } + std::atomic result{true}; + auto iter = TensorIteratorConfig().add_input(self).build(); + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "equal_notnan_cpu", [&] { + iter.for_each([&](char** data, const int64_t *strides, int64_t dim_size) { + if (!result) { + return; + } + char* self_data = data[0]; + for (C10_UNUSED const auto i : c10::irange(dim_size)) { + if (isnan_(c10::load(self_data))) { + result = false; + return; + } + self_data += strides[0]; + } + }); + }); + return result.load(); } std::atomic result{true}; diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 5522368ff64a5..bc932f05d7d0e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1615,59 +1615,67 @@ variants: method manual_cpp_binding: True -- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor +- func: convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor dispatch: CompositeExplicitAutograd: convolution autogen: convolution.out tags: core -- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? 
bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: CompositeExplicitAutograd, CUDA: convolution_backward autogen: convolution_backward.out tags: core -- func: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor +- func: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor dispatch: CompositeExplicitAutograd: convolution_overrideable autogen: convolution_overrideable.out -- func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) +- func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) dispatch: CompositeExplicitAutograd: convolution_backward_overrideable autogen: convolution_backward_overrideable.out -- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor +- func: _convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor dispatch: CompositeExplicitAutograd: _convolution autogen: _convolution.out -- func: _convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor +- func: _convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, int[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor -- func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, int[] stride, str padding, int[] dilation, int groups) -> Tensor +- func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, str padding, SymInt[] dilation, SymInt groups) -> Tensor + dispatch: + CompositeImplicitAutograd: _convolution_mode_symint -- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? 
ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) -- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor +- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] dilation=1, SymInt groups=1) -> Tensor dispatch: CompositeImplicitAutograd: conv1d_symint -- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor +- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, SymInt groups=1) -> Tensor dispatch: CompositeImplicitAutograd: conv2d_symint -- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor +- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, SymInt groups=1) -> Tensor dispatch: CompositeImplicitAutograd: conv3d_symint -- func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, str padding="valid", int[1] dilation=1, int groups=1) -> Tensor +- func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, str padding="valid", SymInt[1] dilation=1, SymInt groups=1) -> Tensor cpp_no_default_args: ['bias', 'stride', 'padding'] + dispatch: + CompositeImplicitAutograd: conv1d_padding_symint -- func: conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, str padding="valid", int[2] dilation=1, int groups=1) -> Tensor +- func: conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, str padding="valid", SymInt[2] dilation=1, SymInt groups=1) -> Tensor cpp_no_default_args: ['bias', 'stride', 'padding'] + dispatch: + CompositeImplicitAutograd: conv2d_padding_symint -- func: conv3d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, str padding="valid", int[3] dilation=1, int groups=1) -> Tensor +- func: conv3d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, str padding="valid", SymInt[3] dilation=1, SymInt groups=1) -> Tensor cpp_no_default_args: ['bias', 'stride', 'padding'] + dispatch: + CompositeImplicitAutograd: conv3d_padding_symint - func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor dispatch: @@ -1677,15 +1685,15 @@ - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor) # NB: we inherit the goofy argument order from PyTorch torch.nn.functional -- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor +- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, SymInt groups=1, SymInt[1] dilation=1) -> Tensor dispatch: CompositeImplicitAutograd: conv_transpose1d_symint -- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor +- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? 
bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor dispatch: CompositeImplicitAutograd: conv_transpose2d_symint -- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor +- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt groups=1, SymInt[3] dilation=1) -> Tensor dispatch: CompositeImplicitAutograd: conv_transpose3d_symint @@ -1807,32 +1815,32 @@ CUDA: cudnn_batch_norm_backward autogen: cudnn_batch_norm_backward.out -- func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor +- func: cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor dispatch: CUDA: cudnn_convolution autogen: cudnn_convolution.out -- func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor +- func: cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor dispatch: CUDA: cudnn_convolution_transpose autogen: cudnn_convolution_transpose.out -- func: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor +- func: _mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor dispatch: MPS: _mps_convolution_transpose autogen: _mps_convolution_transpose.out -- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[2] output_mask) -> (Tensor, Tensor) +- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask) -> (Tensor, Tensor) dispatch: MPS: mps_convolution_transpose_backward autogen: mps_convolution_transpose_backward.out -- func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor +- func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CUDA: cudnn_convolution_relu autogen: cudnn_convolution_relu.out -- func: cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor +- func: cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? 
bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CUDA: cudnn_convolution_add_relu autogen: cudnn_convolution_add_relu.out @@ -3923,17 +3931,17 @@ # TODO: Add this function to MPS dispatch key so that we avoid declaring it in # native_functions.yaml # https://github.com/pytorch/pytorch/issues/77394 -- func: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor +- func: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor dispatch: MPS: _mps_convolution autogen: _mps_convolution.out -- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: MPS: mps_convolution_backward autogen: mps_convolution_backward.out -- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor +- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CompositeExplicitAutograd: mkldnn_convolution autogen: mkldnn_convolution.out @@ -3959,26 +3967,26 @@ CUDA: miopen_batch_norm_backward autogen: miopen_batch_norm_backward.out -- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_convolution autogen: miopen_convolution.out -- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_convolution_transpose autogen: miopen_convolution_transpose.out -- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_depthwise_convolution autogen: miopen_depthwise_convolution.out -- func: miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor +- func: miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CUDA: miopen_convolution_relu -- func: miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? 
bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor +- func: miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CUDA: miopen_convolution_add_relu @@ -4284,7 +4292,7 @@ - func: _nnpack_available() -> bool -- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor +- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1) -> Tensor variants: function dispatch: CompositeExplicitAutograd: _nnpack_spatial_convolution @@ -4411,13 +4419,13 @@ CompositeExplicitAutogradNonFunctional: math_pixel_unshuffle autogen: pixel_unshuffle.out -- func: channel_shuffle(Tensor self, int groups) -> Tensor +- func: channel_shuffle(Tensor self, SymInt groups) -> Tensor dispatch: CPU, CUDA: channel_shuffle QuantizedCPU: channel_shuffle_quantized_cpu autogen: channel_shuffle.out -- func: native_channel_shuffle(Tensor self, int groups) -> Tensor +- func: native_channel_shuffle(Tensor self, SymInt groups) -> Tensor dispatch: CPU: channel_shuffle_cpu CompositeImplicitAutograd: math_channel_shuffle @@ -7239,14 +7247,14 @@ CPU: dense_to_mkldnn autogen: to_mkldnn.out -- func: mkldnn_reorder_conv2d_weight(Tensor self, int[2] padding=0, int[2] stride=1, int[2] dilation=1, int groups=1, int[]? input_size=None) -> Tensor +- func: mkldnn_reorder_conv2d_weight(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv2d_weight autogen: mkldnn_reorder_conv2d_weight.out -- func: mkldnn_reorder_conv3d_weight(Tensor self, int[3] padding=0, int[3] stride=1, int[3] dilation=1, int groups=1) -> Tensor +- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor variants: function python_module: nn dispatch: @@ -12611,101 +12619,101 @@ # make the operational distinction clear. tags: pointwise -- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!) +- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn structured: True dispatch: CPU: slow_conv_transpose2d_structured_cpu CUDA: slow_conv_transpose2d_structured_cuda -- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor +- func: slow_conv_transpose2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1) -> Tensor python_module: nn structured_delegate: slow_conv_transpose2d.out -- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) +- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? 
bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: slow_conv_transpose3d_out_cpu CUDA: slow_conv_transpose3d_out_cuda -- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor +- func: slow_conv_transpose3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1) -> Tensor python_module: nn dispatch: CPU: slow_conv_transpose3d_cpu CUDA: slow_conv_transpose3d_cuda -- func: thnn_conv2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) +- func: thnn_conv2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -- func: thnn_conv2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0) -> Tensor +- func: thnn_conv2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0) -> Tensor python_module: nn -- func: _slow_conv2d_forward.output(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, *, Tensor(a!) output) -> Tensor(a!) +- func: _slow_conv2d_forward.output(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) output) -> Tensor(a!) python_module: nn dispatch: CPU: slow_conv2d_forward_out_cpu CUDA: slow_conv2d_forward_out_cuda -- func: _slow_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> Tensor +- func: _slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor python_module: nn dispatch: CPU: slow_conv2d_forward_cpu CUDA: slow_conv2d_forward_cuda -- func: _slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) +- func: _slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) python_module: nn dispatch: CPU: slow_conv2d_backward_out_cpu CUDA: slow_conv2d_backward_out_cuda -- func: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) +- func: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) python_module: nn dispatch: CPU: slow_conv2d_backward_cpu CUDA: slow_conv2d_backward_cuda autogen: _slow_conv2d_backward.output_mask_out -- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!) 
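The schema changes in this region migrate convolution arguments from int/int[] to SymInt/SymInt[], so sizes traced under dynamic shapes stay symbolic instead of being specialized to concrete integers. A minimal sketch of the effect, assuming the existing public torch.compile and torch._dynamo.mark_dynamic APIs; the function name pad_and_conv is illustrative only and not part of the patch:

import torch
import torch.nn.functional as F

def pad_and_conv(x, w):
    # Under dynamic-shape tracing, x.shape[-1] is a SymInt, so the derived
    # padding value flowing into the convolution is symbolic as well.
    pad = x.shape[-1] // 8
    return F.conv2d(x, w, stride=1, padding=pad)

x = torch.randn(2, 3, 16, 16)
w = torch.randn(4, 3, 3, 3)
torch._dynamo.mark_dynamic(x, 3)  # mark the last spatial dim as dynamic
compiled = torch.compile(pad_and_conv, backend="eager", dynamic=True)
print(compiled(x, w).shape)
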
+- func: _conv_depthwise2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation, *, Tensor(a!) out) -> Tensor(a!) use_const_ref_for_mutable_tensors: True python_module: nn dispatch: CUDA: conv_depthwise2d_cuda_out -- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor +- func: _conv_depthwise2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation) -> Tensor python_module: nn dispatch: CUDA: conv_depthwise2d_cuda -- func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor +- func: conv_depthwise3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation) -> Tensor python_module: nn dispatch: CUDA: conv_depthwise3d_cuda autogen: conv_depthwise3d.out -- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) +- func: slow_conv3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0) -> Tensor +- func: slow_conv3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0) -> Tensor python_module: nn -- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!) +- func: slow_conv3d_forward.output(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!) python_module: nn dispatch: CPU: slow_conv3d_forward_out_cpu -- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor +- func: slow_conv3d_forward(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding) -> Tensor python_module: nn dispatch: CPU: slow_conv3d_forward_cpu -- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor +- func: slow_conv_dilated2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1) -> Tensor python_module: nn dispatch: CPU: slow_conv_dilated2d_cpu CUDA: slow_conv_dilated2d_cuda autogen: slow_conv_dilated2d.out -- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor +- func: slow_conv_dilated3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? 
bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1) -> Tensor python_module: nn dispatch: CPU: slow_conv_dilated3d_cpu diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 400d57b7c1f8f..0511b8fd0f63c 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -936,8 +936,7 @@ if(USE_ROCM) torch_compile_options(torch_hip) # see cmake/public/utils.cmake # TODO: Not totally sure if this is live or not if(USE_NCCL) - # fmt::fmt-header-only is needed by the NCCL code in torch/csrc/distributed/c10d - target_link_libraries(torch_hip PRIVATE __caffe2_nccl fmt::fmt-header-only) + target_link_libraries(torch_hip PRIVATE __caffe2_nccl) target_compile_definitions(torch_hip PRIVATE USE_NCCL) endif() @@ -972,8 +971,7 @@ elseif(USE_CUDA) target_compile_definitions(torch_cuda PRIVATE USE_CUSPARSELT) endif() if(USE_NCCL) - # fmt::fmt-header-only is needed by the NCCL code in torch/csrc/distributed/c10d - target_link_libraries(torch_cuda PRIVATE __caffe2_nccl fmt::fmt-header-only) + target_link_libraries(torch_cuda PRIVATE __caffe2_nccl) target_compile_definitions(torch_cuda PRIVATE USE_NCCL) endif() if(USE_UCC) diff --git a/setup.py b/setup.py index 6ef6666014836..04eaf8ccc5282 100644 --- a/setup.py +++ b/setup.py @@ -926,16 +926,10 @@ def configure_extension_build(): "-Wno-unused-parameter", "-Wno-missing-field-initializers", "-Wno-unknown-pragmas", - # This is required for Python 2 declarations that are deprecated in 3. - "-Wno-deprecated-declarations", # Python 2.6 requires -fno-strict-aliasing, see # http://legacy.python.org/dev/peps/pep-3123/ # We also depend on it in our code (even Python 3). "-fno-strict-aliasing", - # Clang has an unfixed bug leading to spurious missing - # braces warnings, see - # https://bugs.llvm.org/show_bug.cgi?id=21629 - "-Wno-missing-braces", ] library_dirs.append(lib_path) diff --git a/test/distributed/_tensor/test_api.py b/test/distributed/_tensor/test_api.py index d15ea96d6beea..e187279ec4c20 100644 --- a/test/distributed/_tensor/test_api.py +++ b/test/distributed/_tensor/test_api.py @@ -57,6 +57,12 @@ def test_distribute_tensor(self): self.assertTrue(dist_tensor.requires_grad) self.assertTrue(dist_tensor.is_leaf) + # test negative dim + shard_minus_spec = [Shard(-1)] + tensor_to_shard = torch.randn(3, 3 * self.world_size) + dist_tensor = distribute_tensor(tensor_to_shard, device_mesh, shard_minus_spec) + self.assertEqual(dist_tensor.placements[0].dim, 1) + @with_comms def test_distribute_tensor_errors(self): device_mesh = DeviceMesh( diff --git a/test/distributed/_tensor/test_dtensor.py b/test/distributed/_tensor/test_dtensor.py index bc2315468c81a..afd63b080417f 100644 --- a/test/distributed/_tensor/test_dtensor.py +++ b/test/distributed/_tensor/test_dtensor.py @@ -194,6 +194,14 @@ def test_from_local(self): expected_grad = torch.ones(3, 3) * 9 self.assertEqual(local_tensor_with_grad.grad, expected_grad) + @with_comms + def test_from_local_negative_dim(self): + device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) + shard_spec = [Shard(-1)] + local_tensor = torch.randn(3, 3) + sharded_tensor = DTensor.from_local(local_tensor, device_mesh, shard_spec) + self.assertEqual(sharded_tensor.placements[0].dim, 1) + @with_comms def test_to_local(self): device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) diff --git a/test/distributed/_tensor/test_math_ops.py b/test/distributed/_tensor/test_math_ops.py index 72bfd9c9d6d05..8c3ba342b087f 100644 --- a/test/distributed/_tensor/test_math_ops.py +++ 
b/test/distributed/_tensor/test_math_ops.py @@ -67,6 +67,7 @@ def test_softmax_fwd(self): dist_y = torch.nn.functional.softmax( dist_x, dim=softmax_dim, dtype=torch.float32 ) + shard_dim = shard_dim + dist_y.ndim if shard_dim < 0 else shard_dim self.assertTrue(dist_y.placements[0].is_shard(dim=shard_dim)) dist_y = dist_y.redistribute(device_mesh, [Replicate()]) self.assertEqual(dist_y.to_local(), local_y) @@ -102,6 +103,7 @@ def test_softmax_with_bwd(self): dist_softmax = dist_x.softmax(dim=softmax_dim) else: dist_softmax = dist_x.softmax(dim=softmax_dim) + shard_dim = shard_dim + dist_x.ndim if shard_dim < 0 else shard_dim self.assertTrue(dist_softmax.placements[0].is_shard(dim=shard_dim)) dist_y = dist_softmax.sum() dist_y = dist_y.redistribute(device_mesh, [Replicate()]) diff --git a/test/distributed/_tensor/test_redistribute.py b/test/distributed/_tensor/test_redistribute.py index b6ffdbd996740..77ace6f79d71a 100644 --- a/test/distributed/_tensor/test_redistribute.py +++ b/test/distributed/_tensor/test_redistribute.py @@ -222,6 +222,18 @@ def test_partial_to_shard(self): torch.ones(local_shape) * self.world_size, ) + @with_comms + def test_redistribute_negative_shard_dim(self): + device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) + local_tensor = torch.randn(12, 3, device=self.device_type, requires_grad=True) + shard_spec = [Shard(1)] + shard_minus_spec = [Shard(-1)] + + shard_tensor = distribute_tensor(local_tensor, device_mesh, shard_spec) + self.assertEqual(shard_tensor.placements[0].dim, 1) + reshard_tensor = shard_tensor.redistribute(device_mesh, shard_minus_spec) + self.assertEqual(shard_tensor.placements[0].dim, 1) + class MultiDimRedistributeTest(DTensorTestBase): @property diff --git a/test/distributed/_tensor/test_xla_integration.py b/test/distributed/_tensor/test_xla_integration.py new file mode 100644 index 0000000000000..9d4e26fc3f02a --- /dev/null +++ b/test/distributed/_tensor/test_xla_integration.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# Owner(s): ["oncall: distributed"] + +import os +import unittest +from functools import wraps +from typing import Any, Callable, Dict, Tuple + +import numpy as np +import torch +from torch.distributed._tensor import DeviceMesh, distribute_tensor, Replicate, Shard +from torch.testing._internal.common_utils import run_tests, TestCase + + +# wrapper to check xla test requirements +def with_xla(func: Callable) -> Callable: + assert func is not None + + @wraps(func) # pyre-ignore[6] + def wrapper( + self, *args: Tuple[object], **kwargs: Dict[str, Any] # type: ignore[misc] + ) -> None: + # TODO(yeounoh) replace this with xr.use_spmd() when we deprecate the flag. 
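The DTensor tests above (test_api.py, test_dtensor.py, test_redistribute.py) expect Shard(-1) to be normalized against the tensor's rank so that placements report a non-negative dim. A minimal sketch of that normalization, using only standard Python/PyTorch; normalize_shard_dim is an illustrative helper, not an API from the patch:

import torch

def normalize_shard_dim(dim: int, ndim: int) -> int:
    # Same convention the tests assert: -1 on a rank-2 tensor maps to dim 1.
    return dim + ndim if dim < 0 else dim

t = torch.randn(3, 3)
assert normalize_shard_dim(-1, t.ndim) == 1
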
+ os.environ["XLA_USE_SPMD"] = "1" + try: + import torch_xla # type:ignore[import] # noqa: F401 + except ImportError as exc: + raise unittest.SkipTest("torch_xla is not installed.") from exc + self.device_type = "xla" + func(self, *args, **kwargs) # type: ignore[misc] + os.environ["XLA_USE_SPMD"] = "0" + + return wrapper + + +class DTensorXLAIntegrationTest(TestCase): + @with_xla + def test_xla_distribute_tensor_1d_shard(self): + import torch_xla.runtime as xr # type:ignore[import] + + device_count = xr.global_runtime_device_count() + if device_count > 1: + device_mesh = DeviceMesh("xla", list(range(device_count))) + shard_spec = [Shard(0)] + + for requires_grad in [True, False]: + tensor_to_shard = torch.randn( + 3 * device_count, 3, requires_grad=requires_grad + ) + dist_tensor = distribute_tensor( + tensor_to_shard, device_mesh, shard_spec + ) + # TODO(yeounoh) switch to DTensor API when XLAShardedTensor inherits DTensor + assert type(dist_tensor).__name__ == "XLAShardedTensor" + global_tensor = dist_tensor.global_tensor # type:ignore[attr-defined] + self.assertEqual( + global_tensor.size(), torch.Size([3 * device_count, 3]) + ) + local_tensor = dist_tensor.local_shards[0].data + self.assertEqual(local_tensor.size(), torch.Size([3, 3])) + if requires_grad: + self.assertTrue(dist_tensor.global_tensor.requires_grad) + self.assertTrue(dist_tensor.is_leaf) + + @with_xla + def test_xla_distribute_tensor_1d_replicate(self): + import torch_xla.runtime as xr # type:ignore[import] + + device_count = xr.global_runtime_device_count() + device_mesh = DeviceMesh("xla", list(range(device_count))) + shard_spec = [Replicate()] + + for requires_grad in [True, False]: + tensor_to_shard = torch.randn( + 3 * device_count, 3, requires_grad=requires_grad + ) + dist_tensor = distribute_tensor(tensor_to_shard, device_mesh, shard_spec) + # TODO(yeounoh) switch to DTensor API when XLAShardedTensor inherits DTensor + assert type(dist_tensor).__name__ == "XLAShardedTensor" + global_tensor = dist_tensor.global_tensor # type:ignore[attr-defined] + self.assertEqual(global_tensor.size(), torch.Size([3 * device_count, 3])) + local_tensor = dist_tensor.local_shards[0].data + self.assertEqual(local_tensor.size(), torch.Size([3 * device_count, 3])) + if requires_grad: + self.assertTrue(dist_tensor.global_tensor.requires_grad) + self.assertTrue(dist_tensor.is_leaf) + + @with_xla + def test_xla_distribute_tensor_2d(self): + import torch_xla.runtime as xr # type:ignore[import] + + device_count = xr.global_runtime_device_count() + if device_count > 1: + device_mesh = DeviceMesh( + "xla", np.array(range(device_count)).reshape(2, device_count // 2) + ) + shard_spec = [Replicate(), Shard(0)] + + for requires_grad in [True, False]: + tensor_to_shard = torch.randn( + 3 * device_count // 2, 3, requires_grad=requires_grad + ) + dist_tensor = distribute_tensor( + tensor_to_shard, device_mesh, shard_spec + ) + # TODO(yeounoh) switch to DTensor API when XLAShardedTensor inherits DTensor + assert type(dist_tensor).__name__ == "XLAShardedTensor" + global_tensor = dist_tensor.global_tensor # type:ignore[attr-defined] + self.assertEqual( + global_tensor.size(), torch.Size([3 * device_count // 2, 3]) + ) + local_tensor = dist_tensor.local_shards[0].data + self.assertEqual(local_tensor.size(), torch.Size([3, 3])) + if requires_grad: + self.assertTrue(dist_tensor.global_tensor.requires_grad) + self.assertTrue(dist_tensor.is_leaf) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/checkpoint/e2e/test_fine_tuning.py 
b/test/distributed/checkpoint/e2e/test_fine_tuning.py index 207c6ad869386..fd948e191e102 100644 --- a/test/distributed/checkpoint/e2e/test_fine_tuning.py +++ b/test/distributed/checkpoint/e2e/test_fine_tuning.py @@ -7,6 +7,7 @@ import torch.distributed as dist import torch.distributed.checkpoint as dist_cp import torch.nn as nn +from torch.distributed._tensor import init_device_mesh from torch.distributed.checkpoint.state_dict import ( get_state_dict, set_state_dict, @@ -14,8 +15,11 @@ ) from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.testing._internal.common_distributed import skip_if_lt_x_gpu -from torch.testing._internal.common_fsdp import FSDPTest -from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN +from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN +from torch.testing._internal.distributed._tensor.common_dtensor import ( + DTensorTestBase, + with_comms, +) from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir @@ -73,14 +77,20 @@ def forward(self, batch): return x -class TestFineTuning(FSDPTest): +class TestFineTuning(DTensorTestBase): @property def world_size(self) -> int: return min(4, torch.cuda.device_count()) + @property + def backend(self): + return "cpu:gloo,cuda:nccl" + def pretrain(self, pretrain_dir: str) -> None: + device_mesh = init_device_mesh(self.device_type, (self.world_size,)) + model = PreTrainedModel().cuda() - model = FSDP(model) + model = FSDP(model, device_mesh=device_mesh) optim = torch.optim.Adam(model.parameters(), lr=1e-3) # Trainining @@ -100,9 +110,11 @@ def pretrain(self, pretrain_dir: str) -> None: ) def finetune(self, pretrain_dir: str, finetune_dir: str) -> None: + device_mesh = init_device_mesh(self.device_type, (self.world_size,)) + model = FineTuningModel().cuda() # TODO: make the parallelism more complicated, e.g., using 2D + DDP. 
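The FSDP wrapping in this file (and in test_state_dict.py further below) now threads a 1-D device mesh from init_device_mesh into FSDP so that checkpointing can produce DTensor-based state dicts. A minimal sketch of that wiring, assuming a process group is already initialized and CUDA devices are visible; wrap_with_mesh is an illustrative helper, not part of the patch:

import torch
from torch.distributed._tensor import init_device_mesh
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def wrap_with_mesh(model: torch.nn.Module, world_size: int) -> FSDP:
    # Build a 1-D mesh over all ranks and hand it to FSDP, mirroring the
    # pretrain()/finetune() changes in this test.
    mesh = init_device_mesh("cuda", (world_size,))
    return FSDP(model.cuda(), device_mesh=mesh, use_orig_params=True)
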
- model = FSDP(model, use_orig_params=True) + model = FSDP(model, use_orig_params=True, device_mesh=device_mesh) optim = torch.optim.Adam(model.parameters(), lr=1e-3) # Simulate that the fine tuning restart after 3 iterations @@ -167,6 +179,7 @@ def finetune(self, pretrain_dir: str, finetune_dir: str) -> None: ) @skip_if_lt_x_gpu(4) + @with_comms @with_temp_dir def test_fine_tuning(self) -> None: self.assertTrue(os.path.exists(self.temp_dir)) @@ -183,3 +196,7 @@ def test_fine_tuning(self) -> None: self.pretrain(pretrain_dir) self.finetune(pretrain_dir, finetune_dir) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/checkpoint/test_state_dict.py b/test/distributed/checkpoint/test_state_dict.py index 5c3c4464f50bb..41d74014839f1 100644 --- a/test/distributed/checkpoint/test_state_dict.py +++ b/test/distributed/checkpoint/test_state_dict.py @@ -10,7 +10,7 @@ import torch.nn as nn from torch.distributed._composable import fully_shard, replicate from torch.distributed._shard.sharded_tensor import ShardedTensor -from torch.distributed._tensor import DTensor +from torch.distributed._tensor import DTensor, init_device_mesh from torch.distributed.checkpoint.state_dict import ( _patch_model_state_dict, _patch_optimizer_state_dict, @@ -31,7 +31,7 @@ ) from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import FSDPTest -from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN +from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN if not dist.is_available(): @@ -221,11 +221,20 @@ def _test_save_load( self._verify_osd_by_load(model, optim, copy_optim, dist_osd) self._verify_osd(model, optim, osd, dist_osd) - def _test_fsdp(self, use_orig_params: bool, use_composable: bool) -> None: + def _test_fsdp( + self, use_orig_params: bool, use_composable: bool, use_dtensor: bool + ) -> None: if not use_orig_params and use_composable: return + # TODO: remove this return after we complete the composable API side change for device_mesh + if use_composable and use_dtensor: + return + def init_model_optim(): + if use_dtensor: + device_mesh = init_device_mesh("cuda", (self.world_size,)) + orig_model = CompositeParamModel(device=torch.device("cuda")) orig_optim = torch.optim.Adam(orig_model.parameters(), lr=1e-3) copy_optim = torch.optim.Adam(orig_model.parameters(), lr=1e-3) @@ -234,11 +243,21 @@ def init_model_optim(): copy.deepcopy(orig_model), policy=ModuleWrapPolicy({UnitModule}) ) else: - dist_model = FSDP( - copy.deepcopy(orig_model), - auto_wrap_policy=ModuleWrapPolicy({UnitModule}), - use_orig_params=use_orig_params, - ) + if use_dtensor: + device_mesh = init_device_mesh("cuda", (self.world_size,)) + dist_model = FSDP( + copy.deepcopy(orig_model), + auto_wrap_policy=ModuleWrapPolicy({UnitModule}), + use_orig_params=use_orig_params, + device_mesh=device_mesh, + ) + else: + dist_model = FSDP( + copy.deepcopy(orig_model), + auto_wrap_policy=ModuleWrapPolicy({UnitModule}), + use_orig_params=use_orig_params, + ) + dist_optim = torch.optim.Adam(dist_model.parameters(), lr=1e-3) return orig_model, orig_optim, copy_optim, dist_model, dist_optim @@ -247,7 +266,11 @@ def init_model_optim(): @skip_if_lt_x_gpu(2) def test_fsdp(self) -> None: self.run_subtests( - {"use_orig_params": [True, False], "use_composable": [True, False]}, + { + "use_orig_params": [True, False], + "use_composable": [True, False], + "use_dtensor": [True, False], + }, self._test_fsdp, ) @@ -418,3 +441,7 @@ def 
test_partial(self) -> None: ) self.assertEqual(model.l.weight, model_state_dict1["l.weight"]) self.assertEqual(model.l.bias, model_state_dict1["l.bias"]) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index f82909a2222c9..0c85af718904a 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -47,6 +47,7 @@ TestCase, run_tests, retry_on_connect_failures, + skipIfRocm, TEST_WITH_DEV_DBG_ASAN, TEST_WITH_ROCM, skip_but_pass_in_sandcastle, @@ -457,6 +458,32 @@ def test_allreduce_in_cudagraph(self): graph.replay() self.assertEqual(xs[0].item(), 8) + @requires_nccl() + @skip_but_pass_in_sandcastle_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs") + @skipIfRocm() + def test_nccl_watchdog_cudagraph(self): + # test that the watchdog does not crash graphs with disallowed event query + store = c10d.FileStore(self.file_name, self.world_size) + pg = self._create_process_group_nccl(store, self.opts()) + rank = self.rank_to_GPU[self.rank][0] + with torch.cuda.device(rank): + for i in range(100): + xs = [torch.FloatTensor([1]).cuda(rank)] + ys = [torch.FloatTensor([4]).cuda(rank)] + for _ in range(30): + pg.allreduce(xs[0]).wait() + + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + xs[0] += 0.0 + pg.allreduce(xs[0]).wait() + pg.allreduce(xs[0]).wait() + pg.allreduce(xs[0]).wait() + xs[0] += 0.0 + + for _ in range(1400): + graph.replay() + @requires_nccl() @skip_but_pass_in_sandcastle_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs") def test_reduce_ops(self): diff --git a/test/dynamo/test_allow_inline_skip.py b/test/dynamo/test_allow_inline_skip.py index b8c1a5e357843..e65dbdb12dd2d 100644 --- a/test/dynamo/test_allow_inline_skip.py +++ b/test/dynamo/test_allow_inline_skip.py @@ -6,9 +6,9 @@ import torch import torch._dynamo.test_case from torch._dynamo.skipfiles import ( - FILE_INLINELIST, FUNC_INLINELIST, - SUBMODULE_INLINELIST, + LEGACY_MOD_INLINELIST, + MOD_INLINELIST, ) from torch._dynamo.utils import istype @@ -36,7 +36,7 @@ class AllowInlineSkipTests(torch._dynamo.test_case.TestCase): # this unit test is to make sure the functions/modules can be correctly imported # or loaded in case there is typo in the strings. 
def test_skipfiles_inlinelist_correctness(self): - for m in FILE_INLINELIST.union(SUBMODULE_INLINELIST): + for m in LEGACY_MOD_INLINELIST.union(MOD_INLINELIST): self.assertTrue(isinstance(importlib.import_module(m), types.ModuleType)) for f in FUNC_INLINELIST: module_name, fn_name = f.rsplit(".", 1) @@ -54,11 +54,9 @@ def fn(x): func_inlinelist.add("torch._dynamo.utils.istype") self.assertTrue( - "torch._dynamo.utils" not in torch._dynamo.skipfiles.FILE_INLINELIST - ) - self.assertTrue( - "torch._dynamo" not in torch._dynamo.skipfiles.SUBMODULE_INLINELIST + "torch._dynamo" not in torch._dynamo.skipfiles.LEGACY_MOD_INLINELIST ) + self.assertTrue("torch._dynamo" not in torch._dynamo.skipfiles.MOD_INLINELIST) with unittest.mock.patch( "torch._dynamo.skipfiles.get_func_inlinelist", diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py index 9cb83b8e08dcb..7ad5f4e11feb0 100644 --- a/test/dynamo/test_functions.py +++ b/test/dynamo/test_functions.py @@ -1962,6 +1962,157 @@ def fn(x): res = fn(x) self.assertEqual(ref, res) + def test_cast_tensor_single_elem(self): + with torch._dynamo.config.patch({"capture_scalar_outputs": True}): + for t, val in [ + (float, 1.0), + (float, 1), + (float, True), + (int, 1), + (int, False), + # (int, 1.0), # fails due to a >= 0 comparison in sym_int + ]: # , bool, complex]: no casting for sym_bool, no sym_complex + + def fn(x): + x = x + 1 + return t(x) + + opt_fn = torch.compile( + fn, backend="eager", fullgraph=True, dynamic=False + ) + x = torch.tensor([val]) + res = fn(x) + ref = opt_fn(x) + self.assertEqual(ref, res) + + # Cannot handle non single-elem + with self.assertRaises(ValueError): + fn(torch.tensor([val] * 2)) + with self.assertRaises(torch._dynamo.exc.TorchRuntimeError): + opt_fn(torch.tensor([val] * 2)) + + def test_set_construction(self): + def fn(x): + y = x.add_(1) + s = set({x}) + s.add(y) + return len(s) + + opt_fn = torch.compile(fn, backend="eager", fullgraph=True) + x = torch.randn(4) + res = fn(x) + ref = opt_fn(x) + self.assertEqual(ref, res) + + def test_is_tensor_tensor(self): + def fn(x, y): + if x is y: + return x * 2 + else: + return x + y + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + x = torch.zeros(2) + y = torch.ones(2) + + self.assertEqual(fn(x, y), fn_opt(x, y)) + self.assertEqual(fn(x, x), fn_opt(x, x)) + + def test_is_mutated_tensor_tensor(self): + def fn(x): + y = x.add_(1) + return x is y + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + z = torch.ones(4) + + self.assertEqual(fn(z), fn_opt(z)) + + def test_is_mutated_tensor_tensor_across_graph_break(self): + def fn(x): + y = x.add_(1) + cond = x is y + x.add_(1) + # The real tensor values are recovered when graph breaking. + # Hence we recover the invariant. 
+ torch._dynamo.graph_break() + x.add_(1) + return x is y, cond + + fn_opt = torch.compile(backend="eager", dynamic=True)(fn) + + z = torch.ones(4) + + self.assertEqual(fn(z), fn_opt(z)) + + def test_is_mutated_tensor_tensor(self): + def fn(x): + y = x.add_(1) + return y is x + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + z = torch.ones(4, 1) + + self.assertEqual(fn(z), fn_opt(z)) + + def test_is_init_in_compile_mutated_tensor_tensor(self): + def fn(x): + z = x.clone() + y = z.add_(1) + return y is z + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + z = torch.ones(4, 1) + + self.assertEqual(fn(z), fn_opt(z)) + + def test_is_init_in_compile_vmapped_mutated_tensor_tensor(self): + def fn(z): + x = z.clone() + y = torch.vmap(torch.Tensor.acos_)(x) + _ = y is z + return y is x + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + z = torch.ones(4, 1) + + self.assertEqual(fn(z), fn_opt(z)) + + def test_is_vmapped_mutated_tensor_tensor(self): + def fn(x): + y = torch.vmap(torch.Tensor.acos_)(x) + return y is x + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + z = torch.ones(4, 1) + + self.assertEqual(fn(z), fn_opt(z)) + + def test_is_init_in_compile_vmapped_mutated_tensor_tensor_multi_arg(self): + def fn(y, z): + a = y.clone() + b = z.clone() + + def g(a, b): + return a.acos_(), b.acos_() + + c, d = torch.vmap(g)(a, b) + return a is c is b is d + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + y = torch.ones(4, 2) + z = torch.ones(4, 10) + + self.assertEqual(fn(y, z), fn_opt(y, z)) + self.assertEqual(fn(y, y), fn_opt(y, y)) + common_utils.instantiate_parametrized_tests(DefaultsTests) diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py index b3183c029fec7..c9f6df2f7cf81 100644 --- a/test/dynamo/test_modules.py +++ b/test/dynamo/test_modules.py @@ -2221,10 +2221,10 @@ def foo(mod, x): mod = Mod() foo(mod, torch.rand([4])) - self.assertEqual(compiles_without_buffers, 0) + self.assertEqual(compiles_without_buffers, 1) foo(mod, torch.rand([4], dtype=torch.half)) - self.assertEqual(compiles_without_buffers, 1) + self.assertEqual(compiles_without_buffers, 2) class Mod2(Mod): def __setattr__(self, name, value): @@ -2232,7 +2232,7 @@ def __setattr__(self, name, value): foo(Mod2(), torch.rand([4])) # causes two compilations, bc unimplemented custom setattr - self.assertTrue(compiles_without_buffers >= 2) + self.assertTrue(compiles_without_buffers >= 4) def test_unspec_non_inlinable_module(self): mod = UnspecNonInlinableModule() diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py index 9d9c6036239ef..19dcf8b609094 100644 --- a/test/dynamo/test_unspec.py +++ b/test/dynamo/test_unspec.py @@ -343,6 +343,17 @@ def fn(inputs, dim): compl_fn = torch.compile(fn, dynamic=True, backend="eager", fullgraph=True) self.assertEqual(compl_fn(inputs, dim), fn(inputs, dim)) + # https://github.com/pytorch/pytorch/issues/104812 + def test_argmin_coerces_symint_to_intlist_spec(self): + def fn(x, dim): + # the python arg parser coerces dim into a vector + return torch.amin(x, dim=dim, keepdim=True) + + x = torch.randn(4, 4, 4) + dim = 2 + compl_fn = torch.compile(fn, dynamic=True, backend="eager", fullgraph=True) + self.assertEqual(compl_fn(x, dim), fn(x, dim)) + def test_exponential(self): def fn(inputs, op_inputs_dict): res = inputs.exponential_(**op_inputs_dict) diff --git a/test/functorch/test_aotdispatch.py 
b/test/functorch/test_aotdispatch.py index 29cc6a87212cd..62a423be91e87 100644 --- a/test/functorch/test_aotdispatch.py +++ b/test/functorch/test_aotdispatch.py @@ -620,23 +620,6 @@ def f(a): inp = [torch.ones(3, 3, requires_grad=False)] self.verify_aot_autograd(f, inp, test_mutation=True) - def test_input_mutation_resize_smaller(self): - def f(a, b): - a.resize_(2, 2) - return a + b - # tenors that require gradients cannot be resized, so only test requires_grad=False case - inp = [ - torch.ones(3, 3), - torch.ones(2, 2, requires_grad=True), - ] - self.verify_aot_autograd(f, inp, test_mutation=True) - - inp = [ - torch.ones(3, 3), - torch.ones(2, 2), - ] - self.verify_aot_autograd(f, inp, test_mutation=True) - def test_input_mutation_batchnorm(self): def f(inpt, weight, bias, running_mean, running_var): # This is additionally a good test, because the input tensors that we mutate @@ -1885,45 +1868,6 @@ def forward(self, x, y): """At compilation time, graph 1 was compiled under the assumption that input 1 would not require grad, but at runtime this was not the case. This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch.""" # noqa: B950 ) - def test_resize_input(self): - def f(x, y): - y.resize_(4) - y.zero_() - self.assertEqual(x.shape, (4,)) - return y - - # NB: don't use verify_aot_autograd as the inputs get - # mutated and I don't trust verify to do it right - - compiled_f = aot_function(f, nop) - ref_x = torch.randn(0) - ref_out = f(ref_x, ref_x) - - test_x = torch.randn(0) - test_out = compiled_f(test_x, test_x) - - self.assertEqual(ref_out, test_out) - - def test_resize_input_smaller(self): - def f(x, y): - y.resize_(4) - y.zero_() - self.assertEqual(x.shape, (4,)) - return y - - # NB: don't use verify_aot_autograd as the inputs get - # mutated and I don't trust verify to do it right - - compiled_f = aot_function(f, nop) - ref_x = torch.randn(5) - ref_out = f(ref_x, ref_x) - - test_x = torch.randn(5) - test_out = compiled_f(test_x, test_x) - - self.assertEqual(ref_out, test_out) - - def test_custom_autograd(self): class CustomFn(torch.autograd.Function): @staticmethod diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py index 9b0fdaec8ccc5..e3d14b522c534 100644 --- a/test/functorch/test_ops.py +++ b/test/functorch/test_ops.py @@ -318,9 +318,13 @@ def is_inplace(op, variant): vjp_fail = { xfail('tensor_split'), # data_ptr composite compliance + # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.batch_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.instance_norm', decorator=skipIfRocm), # https://github.com/pytorch/pytorch/issues/96560 + decorate('nn.functional.layer_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.scaled_dot_product_attention', decorator=skipIfRocm), } @@ -478,10 +482,11 @@ def abs_if_complex(t): xfail('NumpyExpMarkDirtyAutogradFunction'), # TODO: https://github.com/pytorch/pytorch/issues/91280 # https://github.com/pytorch/pytorch/issues/96560 - # ROCm: NotImplementedError decorate('nn.functional.batch_norm', decorator=skipIfRocm), - # ROCm: NotImplementedError + # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.instance_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 + decorate('nn.functional.layer_norm', decorator=skipIfRocm), # --- Non-Contiguous Failures! 
--- # This is expected to fail as the operator @@ -1030,8 +1035,11 @@ def test_vmapvjp(self, device, dtype, op): xfail("_native_batch_norm_legit"), # https://github.com/pytorch/pytorch/issues/96560 - # ROCm: NotImplementedError + decorate('nn.functional.batch_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.instance_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 + decorate('nn.functional.layer_norm', decorator=skipIfRocm), # ---------------------------------------------------------------------- } @@ -1547,6 +1555,8 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents): xfail("native_batch_norm"), xfail("_native_batch_norm_legit"), xfail('native_dropout_backward'), + decorate('linalg.svd', decorator=skipIfRocm), # https://github.com/pytorch/pytorch/issues/97256 + decorate('svd', decorator=skipIfRocm), # Flaky tensor-likes are not close error on ROCm, adjust tolerance? })) @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,)) @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)}) @@ -1771,7 +1781,13 @@ def fn(input, weight, bias): xfail('nn.functional.max_unpool2d'), # contiguous call xfail('to_sparse'), # dispatch key issue - # https://github.com/pytorch/pytorch/issues/96560 + # https://github.com/pytorch/pytorch/issues/96560 + decorate('nn.functional.batch_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 + decorate('nn.functional.instance_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 + decorate('nn.functional.layer_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 decorate('xlogy', decorator=skipIfRocm), # numerical inconsistencies, look like bugs diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py index a3df80e6eb807..5b1f6c0e7634b 100644 --- a/test/functorch/test_vmap.py +++ b/test/functorch/test_vmap.py @@ -3598,6 +3598,8 @@ def test(): # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.batch_norm', decorator=skipIfRocm), + decorate('nn.functional.instance_norm', decorator=skipIfRocm), + decorate('nn.functional.layer_norm', decorator=skipIfRocm), # RuntimeError: output with shape [4, 4] doesn't match the broadcast shape [1, 4, 4] xfail('addcdiv'), @@ -3741,6 +3743,8 @@ def test_vmap_exhaustive(self, device, dtype, op): skip('_softmax_backward_data'), # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.batch_norm', decorator=skipIfRocm), + decorate('nn.functional.instance_norm', decorator=skipIfRocm), + decorate('nn.functional.layer_norm', decorator=skipIfRocm), # One or more of the overload doesn't have a Batch rule. 
xfail('bincount'), diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py index a6fbb2ce6b6ce..3b1ff31c80178 100644 --- a/test/inductor/test_cpu_repro.py +++ b/test/inductor/test_cpu_repro.py @@ -2267,8 +2267,10 @@ def fn(x): metrics.reset() x = torch.randn(1, 32, 16, 68) opt_fn = torch._dynamo.optimize("inductor")(fn) + code = run_and_get_cpp_code(opt_fn, x) self.assertTrue(same(fn(x), opt_fn(x))) - assert metrics.generated_cpp_vec_kernel_count == 2 + # def and use + FileCheck().check_count("cpp_fused", 2, exactly=True).run(code) def test_invalid_index_of_empty_tensor(self): def fn(a): diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py index f3405ab2ad511..dc5cb7b722c5e 100644 --- a/test/inductor/test_max_autotune.py +++ b/test/inductor/test_max_autotune.py @@ -223,7 +223,7 @@ def mm(a, b): @unittest.skipIf(not SM75OrLater, "need sm_75") @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") @parametrize("dynamic", (False,)) - @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen, Triton, CUTLASS")) + @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen,Triton,CUTLASS")) @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) def test_max_autotune_cutlass_backend_regular_mm( self, dynamic: bool, max_autotune_gemm_backends: str @@ -260,7 +260,7 @@ def mm(a, b): @unittest.skipIf(not SM75OrLater, "need sm_75") @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") @parametrize("dynamic", (False,)) - @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen, Triton, CUTLASS")) + @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen,Triton,CUTLASS")) @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) def test_max_autotune_cutlass_backend_mm_bias( self, dynamic: bool, max_autotune_gemm_backends: str @@ -331,7 +331,7 @@ def addmm(x, a, b): @unittest.skipIf(not SM75OrLater, "need sm_75") @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") @parametrize("dynamic", (False,)) - @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen, Triton, CUTLASS")) + @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen,Triton,CUTLASS")) @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) def test_max_autotune_cutlass_backend_addmm( self, dynamic, max_autotune_gemm_backends diff --git a/test/inductor/test_perf.py b/test/inductor/test_perf.py index 8677351a95baf..65978e241e52f 100644 --- a/test/inductor/test_perf.py +++ b/test/inductor/test_perf.py @@ -189,6 +189,24 @@ def f(a, b): inp = (T(10, 10), T(10, 10)) self.assertExpectedInline(count_numel(f, *inp), """400""") + def f(a, b, c): + return torch.cat((a + 1, b + 2, c + 3)) + 10 + + inp = (T(10, 10), T(10, 10), T(10, 10)) + self.assertExpectedInline(count_numel(f, *inp), """600""") + + def f(a, b, c, d, e): + return torch.cat((a + 1, b + 2, c + 3, d + 4, e + 5)) + 10 + + inp = [T(10, 10) for _ in range(5)] + self.assertExpectedInline(count_numel(f, *inp), """2000""") + + def f(a, b): + return torch.cat([a.sum(dim=0), b.sum(dim=0)]) + 10 + + inp = [T(10, 10, 10), T(10, 10, 10)] + self.assertExpectedInline(count_numel(f, *inp), """2600""") + def test_index(self): def f(a, b): return a[b] diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py index ebd4a27386172..5659aee7777fc 100644 --- a/test/inductor/test_torchinductor_opinfo.py +++ 
b/test/inductor/test_torchinductor_opinfo.py @@ -203,6 +203,8 @@ def format_op(op): "cholesky": {f32, f64}, "complex": {f16}, "exponential": {f16}, + "resize_": {b8, f16, f32, f64, i32, i64}, + "resize_as_": {b8, f16, f32, f64, i32, i64}, "geometric": {f16}, "log_normal": {f16}, "masked_scatter": {f16, f32, f64}, @@ -234,6 +236,8 @@ def format_op(op): "cauchy": {f16}, "cholesky": {f32, f64}, "exponential": {f16}, + "resize_": {b8, f16, f32, f64, i32, i64}, + "resize_as_": {b8, f16, f32, f64, i32, i64}, "geometric": {f16}, "log_normal": {f16}, "masked_scatter": {f16, f32, f64}, diff --git a/test/mobile/custom_build/prepare_model.py b/test/mobile/custom_build/prepare_model.py index 039ec2bec76be..feb044e3bfc03 100644 --- a/test/mobile/custom_build/prepare_model.py +++ b/test/mobile/custom_build/prepare_model.py @@ -5,11 +5,11 @@ """ import torch -import torchvision import yaml +from torchvision import models # Download and trace the model. -model = torchvision.models.mobilenet_v2(pretrained=True) +model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1) model.eval() example = torch.rand(1, 3, 224, 224) traced_script_module = torch.jit.trace(model, example) diff --git a/test/mobile/model_test/torchvision_models.py b/test/mobile/model_test/torchvision_models.py index 25c4ab15c5d04..e86fe2fdbf948 100644 --- a/test/mobile/model_test/torchvision_models.py +++ b/test/mobile/model_test/torchvision_models.py @@ -1,12 +1,12 @@ import torch -import torchvision from torch.utils.bundled_inputs import augment_model_with_bundled_inputs from torch.utils.mobile_optimizer import optimize_for_mobile +from torchvision import models class MobileNetV2Module: def getModule(self): - model = torchvision.models.mobilenet_v2(pretrained=True) + model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1) model.eval() example = torch.zeros(1, 3, 224, 224) traced_script_module = torch.jit.trace(model, example) @@ -23,7 +23,7 @@ def getModule(self): class MobileNetV2VulkanModule: def getModule(self): - model = torchvision.models.mobilenet_v2(pretrained=True) + model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1) model.eval() example = torch.zeros(1, 3, 224, 224) traced_script_module = torch.jit.trace(model, example) @@ -40,7 +40,7 @@ def getModule(self): class Resnet18Module: def getModule(self): - model = torchvision.models.resnet18(pretrained=True) + model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1) model.eval() example = torch.zeros(1, 3, 224, 224) traced_script_module = torch.jit.trace(model, example) diff --git a/test/onnx/test_fx_to_onnx_with_onnxruntime.py b/test/onnx/test_fx_to_onnx_with_onnxruntime.py index f8439ef210565..a5f90cb61f62d 100644 --- a/test/onnx/test_fx_to_onnx_with_onnxruntime.py +++ b/test/onnx/test_fx_to_onnx_with_onnxruntime.py @@ -261,7 +261,7 @@ def test_resnet18(self): # So we are explicitly calling `model.eval()` for any model that contains # batch norm. 
# Ref: https://github.com/pytorch/pytorch/issues/99662#issuecomment-1528178221 - model = torchvision.models.resnet18(pretrained=False).eval() + model = torchvision.models.resnet18(weights=None).eval() dummy_input = torch.randn(1, 3, 224, 224) self.run_test_with_fx_to_onnx_exporter_and_onnx_runtime( @@ -276,7 +276,7 @@ def test_resnet18(self): @skip_if_no_torchvision def test_shufflenet_v2(self): # TODO(bowbao): see Note [training vs eval in dynamo_export] - model = torchvision.models.shufflenet_v2_x0_5(pretrained=False).eval() + model = torchvision.models.shufflenet_v2_x0_5(weights=None).eval() dummy_input = torch.randn(1, 3, 224, 224, requires_grad=False) test_inputs = torch.randn(3, 3, 224, 224, requires_grad=False) diff --git a/test/onnx/test_models.py b/test/onnx/test_models.py index b50e8e903c7ba..f56d9cd9f23c2 100644 --- a/test/onnx/test_models.py +++ b/test/onnx/test_models.py @@ -253,7 +253,7 @@ def test_shufflenet(self): def test_fcn(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest( - toC(fcn_resnet101(pretrained=False, pretrained_backbone=False)), + toC(fcn_resnet101(weights=None, weights_backbone=None)), toC(x), rtol=1e-3, atol=1e-5, @@ -263,7 +263,7 @@ def test_fcn(self): def test_deeplab(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest( - toC(deeplabv3_resnet101(pretrained=False, pretrained_backbone=False)), + toC(deeplabv3_resnet101(weights=None, weights_backbone=None)), toC(x), rtol=1e-3, atol=1e-5, diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py index af259b4e1d67a..f9afca487f52b 100644 --- a/test/onnx/test_models_onnxruntime.py +++ b/test/onnx/test_models_onnxruntime.py @@ -420,7 +420,7 @@ def test_mobilenet_v3(self): @skipIfUnsupportedMinOpsetVersion(11) @skipScriptTest() def test_shufflenet_v2_dynamic_axes(self): - model = torchvision.models.shufflenet_v2_x0_5(pretrained=False) + model = torchvision.models.shufflenet_v2_x0_5(weights=None) dummy_input = torch.randn(1, 3, 224, 224, requires_grad=True) test_inputs = torch.randn(3, 3, 224, 224, requires_grad=True) self.run_test( diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index 15f1b5abf2d3c..2a57f043c700f 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -1668,7 +1668,7 @@ def forward(self, x): self.assertEqual(len(list(graph.nodes())), 1) def test_fuse_resnet18(self): - model = torchvision.models.resnet18(pretrained=False) + model = torchvision.models.resnet18(weights=None) x = torch.randn(2, 3, 224, 224, requires_grad=True) graph, _, __ = self._model_to_graph( model, diff --git a/test/quantization/pt2e/test_quantize_pt2e.py b/test/quantization/pt2e/test_quantize_pt2e.py index 684ba89eef89b..879c0224b177d 100644 --- a/test/quantization/pt2e/test_quantize_pt2e.py +++ b/test/quantization/pt2e/test_quantize_pt2e.py @@ -811,6 +811,255 @@ def validate(self, model: torch.fx.GraphModule) -> None: m, expected_node_list=node_list, expected_node_occurrence=node_occurrence ) + def _test_transitive_sharing_with_cat_helper(self, quantizer): + m = TestHelperModules.Conv2dWithTwoCat().eval() + example_inputs = (torch.randn(1, 3, 5, 5), torch.randn(1, 3, 5, 5), torch.randn(1, 6, 3, 3), torch.randn(1, 6, 3, 3)) + + # program capture + m = capture_pre_autograd_graph( + m, + example_inputs, + ) + m = prepare_pt2e(m, quantizer) + m(*example_inputs) + # make sure the two input observers and output are shared + conv_output_obs = [] + for n in m.graph.nodes: + if n.op 
== "call_function" and n.target == torch.ops.aten.conv2d.default: + conv_output_obs.append(getattr(m, list(n.users)[0].target)) + if n.op == "call_function" and n.target == torch.ops.aten.cat.default: + inputs = n.args[0] + input0 = inputs[0] + input1 = inputs[1] + assert input0.op == "call_module" + assert input1.op == "call_module" + obs_ins0 = getattr(m, input0.target) + obs_ins1 = getattr(m, input1.target) + assert obs_ins0 == obs_ins1 + + output_obs = list(n.users)[0] + assert output_obs.op == "call_module" + obs_ins2 = getattr(m, output_obs.target) + assert obs_ins0 == obs_ins2, "input observer does not match output" + + assert len(conv_output_obs) == 2, "expecting two observer that follows conv2d ops" + # checking that the output observers for the two convs are shared as well + assert conv_output_obs[0] == conv_output_obs[1] + + m(*example_inputs) + m = convert_pt2e(m, fold_quantize=True) + + node_occurrence = { + # two for input of the first conv, one for output for the first conv + ns.call_function( + torch.ops.quantized_decomposed.quantize_per_tensor.default + ): 7, + ns.call_function( + torch.ops.quantized_decomposed.dequantize_per_tensor.default + ): 9, + } + node_list = [ + ns.call_function( + torch.ops.quantized_decomposed.dequantize_per_tensor.default + ), + ns.call_function( + torch.ops.quantized_decomposed.dequantize_per_tensor.default + ), + ns.call_function(torch.ops.aten.cat.default), + ns.call_function( + torch.ops.quantized_decomposed.quantize_per_tensor.default + ), + ns.call_function( + torch.ops.quantized_decomposed.dequantize_per_tensor.default + ), + ns.call_function(torch.ops.aten.cat.default), + ns.call_function( + torch.ops.quantized_decomposed.quantize_per_tensor.default + ), + ] + self.checkGraphModuleNodes( + m, expected_node_list=node_list, expected_node_occurrence=node_occurrence + ) + + def test_shared_qspec_transitivity(self): + """This tests the transitivity of SharedQuantizationSpec, that is + if A is shared with B, B is shared with C, then C should be shared with A as well + + x1 -> conv1 -> cat1 -----> cat2 + x2 -> conv2 -/ / + x3 -> add / + x4 / + + both cat has shared input and output, and because of cat and (cat1 -> cat2) is the same Tensor + so there is an implicit sharing here, all tensors connect to cat1 and cat2 are in the same + sharing group after transitive sharing + """ + # TODO: refactor this to a common util + class BackendAQuantizer(Quantizer): + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + for node in model.graph.nodes: + if ( + node.op == "call_function" + and node.target == torch.ops.aten.conv2d.default + ): + input_act = node.args[0] + assert isinstance(input_act, Node) + weight = node.args[1] + assert isinstance(weight, Node) + bias = node.args[2] + assert isinstance(bias, Node) + act_qspec = QuantizationSpec( + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.default_observer, + ) + weight_qspec = QuantizationSpec( + dtype=torch.int8, + quant_min=-128, + quant_max=127, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.default_weight_observer, + ) + bias_qspec = QuantizationSpec( + dtype=torch.float32, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.PlaceholderObserver, + ) + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + input_act: act_qspec, + weight: weight_qspec, + bias: bias_qspec, + }, + output_qspec=act_qspec, + 
_annotated=True, + ) + elif node.target is torch.ops.aten.cat.default: + cat_node = node + input_nodes = cat_node.args[0] + first_input_node = input_nodes[0] + input_qspec_map = {} + act_qspec = QuantizationSpec( + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.default_observer, + ) + input_qspec_map[first_input_node] = act_qspec + share_qparams_with_input_act0_qspec = SharedQuantizationSpec((first_input_node, cat_node)) + for input_node in input_nodes[1:]: + input_qspec_map[input_node] = share_qparams_with_input_act0_qspec + + cat_node.meta[ + "quantization_annotation" + ] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=share_qparams_with_input_act0_qspec, + _annotated=True, + ) + + def validate(self, model: torch.fx.GraphModule) -> None: + pass + + self._test_transitive_sharing_with_cat_helper(BackendAQuantizer()) + + def test_shared_qspec_transitivity_case_2(self): + """This tests the transitivity of SharedQuantizationSpec, that is + if A is shared with B, B is shared with C, then C should be shared with A as well + + x1 -> conv1 -> cat1 -----> cat2 + x2 -> conv2 -/ / + x3 -> add / + x4 / + + both cat has shared input and output, and because of cat and (cat1 -> cat2) is the same Tensor + so there is an implicit sharing here, all tensors connect to cat1 and cat2 are in the same + sharing group after transitive sharing + + the difference is that for this one, all edges and nodes are shared with the second input edge of cat + instead of the first input edge of cat as in previous example + """ + # TODO: refactor this to a common util + class BackendAQuantizer(Quantizer): + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + for node in model.graph.nodes: + if ( + node.op == "call_function" + and node.target == torch.ops.aten.conv2d.default + ): + input_act = node.args[0] + assert isinstance(input_act, Node) + weight = node.args[1] + assert isinstance(weight, Node) + bias = node.args[2] + assert isinstance(bias, Node) + act_qspec = QuantizationSpec( + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.default_observer, + ) + weight_qspec = QuantizationSpec( + dtype=torch.int8, + quant_min=-128, + quant_max=127, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.default_weight_observer, + ) + bias_qspec = QuantizationSpec( + dtype=torch.float32, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.PlaceholderObserver, + ) + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + input_act: act_qspec, + weight: weight_qspec, + bias: bias_qspec, + }, + output_qspec=act_qspec, + _annotated=True, + ) + elif node.target is torch.ops.aten.cat.default: + cat_node = node + input_nodes = cat_node.args[0] + first_input_node = input_nodes[0] + second_input_node = input_nodes[1] + input_qspec_map = {} + act_qspec = QuantizationSpec( + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.default_observer, + ) + input_qspec_map[second_input_node] = act_qspec + share_qparams_with_input_act1_qspec = SharedQuantizationSpec((second_input_node, cat_node)) + input_qspec_map[first_input_node] = share_qparams_with_input_act1_qspec + + cat_node.meta[ + "quantization_annotation" + ] = QuantizationAnnotation( + 
input_qspec_map=input_qspec_map, + output_qspec=share_qparams_with_input_act1_qspec, + _annotated=True, + ) + + def validate(self, model: torch.fx.GraphModule) -> None: + pass + + self._test_transitive_sharing_with_cat_helper(BackendAQuantizer()) + def test_int16(self): class Int16ActQuantizer(Quantizer): def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: diff --git a/test/test_cuda.py b/test/test_cuda.py index e265e094f26f1..e4bb3145d56a1 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -80,8 +80,23 @@ def test_pinned_memory_with_cudaregister(self): torch.cuda.memory._set_allocator_settings("pinned_use_cuda_host_register:True,pinned_num_register_threads:8") t = torch.ones(20) self.assertFalse(t.is_pinned()) - pinned_t = torch.ones(1 << 21).pin_memory() - self.assertTrue(pinned_t.is_pinned()) + try: + pinned_t = torch.ones(1 << 21).pin_memory() + self.assertTrue(pinned_t.is_pinned()) + pinned_t = torch.ones(1 << 24).pin_memory() + self.assertTrue(pinned_t.is_pinned()) + except RuntimeError as e: + # Some GPUs don't support same address space on host and device side + pass + + def test_pinned_memory_with_cudaregister_multithread(self): + num_threads = 4 + threads = [threading.Thread(target=self.test_pinned_memory_with_cudaregister) + for t in range(num_threads)] + for thread in threads: + thread.start() + for thread in threads: + thread.join() def test_cudart_register(self): t = torch.ones(20) diff --git a/test/test_jit.py b/test/test_jit.py index 6ffca95e33f1d..0131619f4a9d5 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -142,6 +142,7 @@ import unittest import warnings import zipfile +import tracemalloc def canonical(graph): @@ -12813,7 +12814,7 @@ def some_func(x): x = torch.rand(3, 4) self.assertEqual(some_func(x), x) - def test_file_format_serialization(self): + def _make_filereader_test_file(self): filename = tempfile.mktemp() writer = torch._C.PyTorchFileWriter(filename) buffers = [os.urandom(size) for size in [random.randint(1, 100) for i in range(20)]] @@ -12824,6 +12825,10 @@ def test_file_format_serialization(self): serialized_offsets = pickle.dumps(offsets) writer.write_record("meta", serialized_offsets, len(serialized_offsets)) writer.write_end_of_file() + return filename, buffers, serialized_offsets + + def test_file_format_serialization(self): + filename, buffers, serialized_offsets = self._make_filereader_test_file() reader = torch._C.PyTorchFileReader(filename) serialized_offsets_read = reader.get_record("meta") @@ -12833,6 +12838,29 @@ def test_file_format_serialization(self): data = reader.get_record(str(offset)) assert(data == buffers[i]) + def test_file_reader_no_memory_leak(self): + num_iters = 10000 + filename, _, _ = self._make_filereader_test_file() + + # Load from filename + tracemalloc.start() + for i in range(num_iters): + torch._C.PyTorchFileReader(filename) + _, peak_from_string = tracemalloc.get_traced_memory() + tracemalloc.stop() + + # Load from stream + tracemalloc.start() + with open(filename, 'rb') as f: + for i in range(num_iters): + f.seek(0) + torch._C.PyTorchFileReader(f) + _, peak_from_file = tracemalloc.get_traced_memory() + tracemalloc.stop() + + # Check if the peak sizes at most differ by an empirically obtained factor + assert peak_from_file < peak_from_string * 500 + # for each type, the input type annotation and corresponding return type annotation def type_input_return_pairs(self): return [ diff --git a/test/test_mkldnn.py b/test/test_mkldnn.py index 683003474ba02..7c39d36ec1649 100644 --- 
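The test_file_reader_no_memory_leak test added to test_jit.py above compares the peak allocations of two code paths with tracemalloc rather than asserting an absolute byte count. A self-contained sketch of that measurement pattern follows; the workloads are stand-ins rather than PyTorchFileReader, and the 500x factor merely echoes the empirically chosen bound in the test.

import tracemalloc

def peak_python_alloc(fn, iters=10_000):
    # peak Python-heap usage (in bytes) observed while running fn repeatedly
    tracemalloc.start()
    for _ in range(iters):
        fn()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    return peak

# stand-in workloads; the real test compares PyTorchFileReader(filename)
# against PyTorchFileReader(open(filename, "rb"))
peak_a = peak_python_alloc(lambda: bytes(1024))
peak_b = peak_python_alloc(lambda: bytearray(1024))
print(peak_a, peak_b)  # the test asserts peak_from_file < peak_from_string * 500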
a/test/test_mkldnn.py +++ b/test/test_mkldnn.py @@ -1440,12 +1440,12 @@ def _test_imagenet_model(self, model): @skipIfNoTorchVision def test_resnet18(self): - model = torchvision.models.resnet.resnet18(pretrained=False) + model = torchvision.models.resnet.resnet18(weights=None) self._test_imagenet_model(model) @skipIfNoTorchVision def test_resnext50_32x4d(self): - model = torchvision.models.resnet.resnext50_32x4d(pretrained=False) + model = torchvision.models.resnet.resnext50_32x4d(weights=None) self._test_imagenet_model(model) def _lstm_params_list(self): diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py index 08b62cc1476ba..d3b04617d2c1b 100644 --- a/test/test_sort_and_select.py +++ b/test/test_sort_and_select.py @@ -1122,6 +1122,20 @@ def test_isin_different_devices(self, device, dtype): with self.assertRaises(RuntimeError): torch.isin(c, d) + @dtypes(*integral_types()) + def test_sort_overflow(self, device, dtype): + " Regression test for https://github.com/pytorch/pytorch/issues/111189 " + prev_num_threads = torch.get_num_threads() + try: + low = 0 if dtype == torch.uint8 else -1 + x = torch.full((32768,), low, dtype=dtype, device=device) + x[:100] = torch.iinfo(x.dtype).max + torch.set_num_threads(1) + uv = x.sort().values.unique() + self.assertEqual(uv.size(0), 2) + finally: + torch.set_num_threads(prev_num_threads) + instantiate_device_type_tests(TestSortAndSelect, globals()) diff --git a/test/test_torch.py b/test/test_torch.py index f090b6b523e69..2798fee12da26 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6351,6 +6351,11 @@ def test_equal(self): self.assertNotEqual(t_0.size(), t_1.size()) self.assertFalse(torch.equal(t_0, t_1)) + # Fast path: tensor containing `nan` is not equal to self + for dtype in floating_and_complex_types(): + t = torch.tensor([1., float('nan')], dtype=dtype) + self.assertFalse(torch.equal(t, t)) + def test_element_size(self): byte = torch.ByteStorage().element_size() char = torch.CharStorage().element_size() diff --git a/third_party/fbgemm b/third_party/fbgemm index d0ee798b1f198..70c6e83c29f67 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit d0ee798b1f198cc51b6ddae20cf6063f6380ba3f +Subproject commit 70c6e83c29f67278751abd0e28433c50743ccbe9 diff --git a/third_party/gloo b/third_party/gloo index 2cbcef29a6aff..cf1e1abc95d0b 160000 --- a/third_party/gloo +++ b/third_party/gloo @@ -1 +1 @@ -Subproject commit 2cbcef29a6aff241896a86c719195f1757bfd1b8 +Subproject commit cf1e1abc95d0b961222ee82b6935f76250fbcf16 diff --git a/third_party/kineto b/third_party/kineto index 5d3c309049e30..49e854d805d91 160000 --- a/third_party/kineto +++ b/third_party/kineto @@ -1 +1 @@ -Subproject commit 5d3c309049e30193bfae03720a2e665f09d65447 +Subproject commit 49e854d805d916b2031e337763928d2f8d2e1fbf diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 9601cc67126ba..f3c577bc531a1 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2260,11 +2260,11 @@ - name: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor self: max_pool2d_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode) -- name: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor - self, weight, bias: "grad.defined() ? 
mps_convolution_backward(self, grad, weight, padding, stride, dilation, groups, grad_input_mask) : std::tuple()" +- name: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor + self, weight, bias: "grad.defined() ? mps_convolution_backward_symint(self, grad, weight, padding, stride, dilation, groups, grad_input_mask) : std::tuple()" -- name: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) +- name: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + grad_output, self, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) - name: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) self: max_pool2d_with_indices_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, result1) @@ -2286,54 +2286,54 @@ indices: non_differentiable result: auto_linear -- name: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor +- name: convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple()" result: convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups) # TorchScript serializes calls to _convolution so this entry is present until that is changed to use convolution. # Note that the benchmark, deterministic, cudnn_enabled, and allow_tf32 flags are queried from the global context # by convolution_backward instead of being passed along from the forward pass. -- name: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor +- name: _convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor input, weight, bias: "grad.defined() ? 
convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple()" result: _convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32) -- name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) grad_output, input, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) result0: std::get<0>(convolution_backward_symint(grad_output_p, input_p, weight_t, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false})) + std::get<0>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false})) result1: std::get<1>(convolution_backward_symint(grad_output_p, input_t, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false})) + std::get<1>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false})) result2: convolution_backward_jvp_grad_bias(grad_output_t, result2) -- name: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor - input, weight, bias: "grad.defined() ? convolution_backward_overrideable(grad, input, weight, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple()" +- name: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor + input, weight, bias: "grad.defined() ? 
convolution_backward_overrideable_symint(grad, input, weight, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple()" -- name: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - grad_output, input, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) +- name: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) + grad_output, input, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) -- name: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor +- name: slow_conv_transpose2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple()" -- name: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor +- name: slow_conv_transpose3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple()" -- name: _slow_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> Tensor - self, weight, bias: "grad.defined() ? _slow_conv2d_backward(grad, self, weight, kernel_size, stride, padding, grad_input_mask) : std::tuple()" +- name: _slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor + self, weight, bias: "grad.defined() ? 
_slow_conv2d_backward_symint(grad, self, weight, kernel_size, stride, padding, grad_input_mask) : std::tuple()" -- name: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, {{1, 1}}, false, {{0, 0}}, 1, grad_input_mask) +- name: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) + grad_output, self, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, {{1, 1}}, false, {{0, 0}}, 1, grad_input_mask) -- name: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor +- name: _conv_depthwise2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple()" -- name: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor +- name: conv_depthwise3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple()" -- name: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor +- name: slow_conv3d_forward(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, /*dilation=*/ {{1, 1, 1}}, false, /*output_padding=*/ {{0, 0, 0}}, 1, grad_input_mask) : std::tuple()" -- name: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor +- name: slow_conv_dilated2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" -- name: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor +- name: slow_conv_dilated3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1) -> Tensor self, weight, bias: "grad.defined() ? 
convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" - name: col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor @@ -2618,14 +2618,14 @@ - name: _cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) log_probs: _cudnn_ctc_loss_backward(grad, result0, result1, zero_infinity) -- name: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor +- name: cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor self, weight: "_cudnn_convolution_backward(self, grad, weight, padding, output_padding, stride, dilation, true, groups, {grad_input_mask[0], grad_input_mask[1]})" -- name: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor - self, weight: "grad.defined() ? mps_convolution_transpose_backward(self, grad, weight, padding, output_padding, stride, dilation, groups, grad_input_mask) : std::tuple()" +- name: _mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor + self, weight: "grad.defined() ? mps_convolution_transpose_backward_symint(self, grad, weight, padding, output_padding, stride, dilation, groups, grad_input_mask) : std::tuple()" -- name: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - self, weight: "_cudnn_convolution_backward(self, grad, weight, padding, std::vector(padding.size(), 0), stride, dilation, false, groups, {grad_input_mask[0], grad_input_mask[1]})" +- name: cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor + self, weight: "_cudnn_convolution_backward(self, grad, weight, padding, std::vector(padding.size(), 0), stride, dilation, false, groups, {grad_input_mask[0], grad_input_mask[1]})" - name: cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output self, grid: "grad.defined() ? cudnn_grid_sampler_backward(self, grid, grad) : std::tuple()" @@ -2655,9 +2655,9 @@ # nnpack -- name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor +- name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1) -> Tensor # NNPACK does not support strided convolutions in the backwards path, which is the reason why we are using the closest available function that does here. - input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector(padding.size(), 1), false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" + input, weight, bias: "grad.defined() ? 
convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector(padding.size(), 1), false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" #LSTM MPS - name: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) @@ -2688,13 +2688,13 @@ # miopen -- name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, groups, grad_input_mask) : std::tuple()" -- name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" -- name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) @@ -2720,7 +2720,7 @@ - name: mkldnn_rnn_layer_backward(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) # mkldnn -- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor +- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" - name: mkldnn_linear(Tensor self, Tensor weight, Tensor? 
bias=None) -> Tensor diff --git a/torch/_dynamo/allowed_functions.py b/torch/_dynamo/allowed_functions.py index bb45d22236049..5d54b1033efb1 100644 --- a/torch/_dynamo/allowed_functions.py +++ b/torch/_dynamo/allowed_functions.py @@ -120,6 +120,7 @@ def _disallowed_function_ids(): warnings.warn, torch._C._dynamo.eval_frame.unsupported, torch.Tensor.__init__, + torch.resize_as_, ] # extract all dtypes from torch diff --git a/torch/_dynamo/bytecode_transformation.py b/torch/_dynamo/bytecode_transformation.py index 20bd96513d681..722adce01f1e1 100644 --- a/torch/_dynamo/bytecode_transformation.py +++ b/torch/_dynamo/bytecode_transformation.py @@ -819,7 +819,7 @@ def maybe_pop_n(n): if output and output[-1].opcode == dis.EXTENDED_ARG: output.pop() - for i, inst in enumerate(instructions): + for inst in instructions: if inst.opcode == dis.EXTENDED_ARG: # Leave this instruction alone for now so we never shrink code inst.arg = 0 diff --git a/torch/_dynamo/codegen.py b/torch/_dynamo/codegen.py index f6de25657bc3b..0af74c8101ddf 100644 --- a/torch/_dynamo/codegen.py +++ b/torch/_dynamo/codegen.py @@ -25,9 +25,9 @@ NumpyNdarrayVariable, SymNodeVariable, TensorVariable, - TensorWithTFOverrideVariable, UnspecializedPythonVariable, ) +from .variables.torch_function import TensorWithTFOverrideVariable @dataclasses.dataclass diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index 2ddc2407fb0c8..e8ec24c43813d 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -980,7 +980,7 @@ def source_ref(source): ): continue - guard.create(builder, builder) + guard.create(builder) self.check_fn = self.compile_check_fn(builder, guards, guard_fail_fn) self._weakrefs.clear() # Keep track of weak references of objects with ID_MATCH guard. This @@ -1004,6 +1004,7 @@ def compile_check_fn(self, builder, guards_out, guard_fail_fn): code_parts = ["___guarded_code.valid", "___check_global_state()"] def add_code_part(code, guard, log_only=False): + extra = "" if guard.user_stack: for fs in reversed(guard.user_stack): if fs.filename not in uninteresting_files(): @@ -1013,7 +1014,7 @@ def add_code_part(code, guard, log_only=False): elif guard.stack: extra = f" # {format_frame(guard.stack.summary()[-1])}" - guards_log.debug("%s", f"{code:<60}{extra}") + guards_log.debug("%s", f"{code:<60}{extra}") if verbose_guards_log.isEnabledFor(logging.DEBUG): maybe_stack = "" diff --git a/torch/_dynamo/resume_execution.py b/torch/_dynamo/resume_execution.py index f0352312a5daa..6d4bd6bb1a8b8 100644 --- a/torch/_dynamo/resume_execution.py +++ b/torch/_dynamo/resume_execution.py @@ -519,7 +519,7 @@ def remap_block_offsets( # to determine where in the original code the PUSH_EXC_INFO offset # replaced. prefix_blocks = [] - for idx, inst in enumerate(instructions): + for inst in instructions: if len(prefix_blocks) == len( meta.prefix_block_target_offset_remap ): diff --git a/torch/_dynamo/skipfiles.py b/torch/_dynamo/skipfiles.py index 52bd1f906295e..068345f694fd9 100644 --- a/torch/_dynamo/skipfiles.py +++ b/torch/_dynamo/skipfiles.py @@ -64,10 +64,10 @@ * BUILTIN_SKIPLIST contains builtin python modules, such as abc, collections, etc. * THIRDPARTY_SKIPLIST contains common third party libraries, such as numpy, pandas, etc. * Functions in these two SKIPLISTs are always skipped, except when they are explicitly - put into the three INLINELIST: FUNC_INLINELIST, FILE_INLINELIST and SUBMODULE_INLINELIST. + put into the two INLINELIST: FUNC_INLINELIST and MOD_INLINELIST. 
* PyTorch(torch) is in the BUILTIN_SKIPLIST by default, but there are many cases where we want inline the functions under torch namespace. We should add them - into one of the three *_INLINELIST to make dynamo inline those functions. + into one of the two *_INLINELIST to make dynamo inline those functions. * If you call functions under skipped modules/files, Dynamo will wrap these functions as SkipFilesVariable. There are a few functions(e.g, collections.OrderedDict) that we have special handling at SkipFilesVariable.call_function. @@ -76,17 +76,14 @@ To figure out what the behavior is, check the following list in order: * FUNC_INLINELIST (Inline if YES) -* FILE_INLINELIST (Inline if YES) -* SUBMODULE_INLINELIST (Inline if YES) +* MOD_INLINELIST (Inline if YES) * BUILTIN_SKIPLIST & THIRDPARTY_SKIPLIST (Skip if YES) * Inline by default In general, if you want to force inline a function or module, please consider adding -the function's file or python module to FILE_INLINELIST first. -Use the FUNC_INLINELIST only when there are other functions under the same file that -you don't want to inline. -In the future, we will consolidate FILE_INLINELIST and SUBMODULE_INLINELIST into one list -as we use the same logic (filename.startswith) to determine if a file or module is skipped. +the function's python module to MOD_INLINELIST first. +Use the FUNC_INLINELIST only when there are other functions under the same module that +you don't want to inline them. """ @@ -114,7 +111,7 @@ tempfile, threading, tokenize, - torch, # torch/* is skipped by default unless specified in FILE_INLINELIST or SUBMODULE_INLINELIST + torch, # torch/* is skipped by default unless specified in FUNC_INLINELIST or MOD_INLINELIST traceback, types, typing, @@ -165,36 +162,25 @@ def _module_dir(m: types.ModuleType): } -# Force inline functions in these files or directories, even they are in *_SKIPLIST. -# We are using python module name instead of file or directory object to avoid circular dependency. -# Please keep this sorted alphabetically. -# TODO: Merge FILE_INLINELIST into SUBMODULE_INLINELIST. -FILE_INLINELIST = { - "torch._dynamo._trace_wrapped_higher_order_op", - "torch._dynamo.comptime", +# These are legacy workarounds, don't add new modules to this list. +# Please use the MOD_INLINELIST instead to force inline functions under particular modules. 
+LEGACY_MOD_INLINELIST = { "torch._dynamo.external_utils", - "torch._dynamo.polyfill", "torch._export.db.examples", "torch._export.wrappers", "torch._functorch.apis", "torch._functorch.deprecated", "torch._higher_order_ops.cond", - "torch._inductor.test_operators", "torch.ao.quantization.pt2e.eval_utils", "torch.ao.quantization.pt2e.qat_utils", "torch.ao.quantization.pt2e.representation.rewrite", "torch.ao.quantization.pt2e.utils", "torch.ao.quantization.quantizer.xnnpack_quantizer", - "torch.nn.modules.container", - "torch.optim._functional", - "torch.random", - "torch.utils._content_store", - "torch.utils._foreach_utils", + "torch.optim", } - if torch.distributed.is_available(): - FILE_INLINELIST |= { + LEGACY_MOD_INLINELIST |= { "torch.distributed._tensor.api", "torch.distributed._tensor.device_mesh", "torch.distributed.algorithms._checkpoint.checkpoint_wrapper", @@ -203,31 +189,35 @@ def _module_dir(m: types.ModuleType): "torch.distributed.tensor.parallel.style", } -# Include optimizer code for tracing -FILE_INLINELIST |= { - str(obj.__module__) for obj in torch.optim.__dict__.values() if inspect.isclass(obj) -} -# TODO: consolidate SUBMODULE_INLINELIST and FILE_INLINELIST into one list -# Force inline functions under these modules, even the modules is in *_SKIPLIST. -SUBMODULE_INLINELIST = { +# Force inline functions under these modules, even they are in *_SKIPLIST. +# We are using python module name instead of file or directory object to avoid circular dependency. +# Please keep this sorted alphabetically. +MOD_INLINELIST = { "torch._refs", "torch._prims", "torch._decomp", + "torch._dynamo._trace_wrapped_higher_order_op", + "torch._dynamo.comptime", + "torch._dynamo.polyfill", + "torch._inductor.test_operators", "torch.ao.nn", "torch.distributions", "torch.fx._pytree", "torch.nn", + "torch.random", "torch.sparse", "torch.testing", + "torch.utils._content_store", "torch.utils._contextlib", + "torch.utils._foreach_utils", "torch.utils._pytree", } if torch.distributed.is_available(): - SUBMODULE_INLINELIST.add("torch.distributed") - SUBMODULE_INLINELIST.add("torch.distributed._functional_collectives") + MOD_INLINELIST.add("torch.distributed") + MOD_INLINELIST.add("torch.distributed._functional_collectives") # TODO: support adding bound method into this list @@ -243,17 +233,17 @@ def get_func_inlinelist(): @functools.lru_cache(None) -def get_file_inlinelist(): +def get_legacy_mod_inlinelist(): inlinelist = set() - for f in FILE_INLINELIST: - inlinelist.add(_module_dir(torch) + f[len("torch.") :].replace(".", "/")) + for m in LEGACY_MOD_INLINELIST: + inlinelist.add(_module_dir(torch) + m[len("torch.") :].replace(".", "/")) return inlinelist @functools.lru_cache(None) -def get_submodule_inlinelist(): +def get_mod_inlinelist(): inlinelist = set() - for m in SUBMODULE_INLINELIST: + for m in MOD_INLINELIST: inlinelist.add(_module_dir(torch) + m[len("torch.") :].replace(".", "/")) return inlinelist @@ -303,21 +293,19 @@ class SkipResult: reason: Optional[str] -# TODO(ybliang): This is a temp function, we should consolidate this with check_file. 
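To make the MOD_INLINELIST / LEGACY_MOD_INLINELIST machinery above concrete, here is a condensed sketch of the two pieces it relies on: turning a dotted module name into a path prefix (what _module_dir plus the string slicing in get_mod_inlinelist do) and the lookup order that the rewritten check_file follows. The helper names are illustrative only, and the real code also consults FBCODE_SKIP_DIRS_RE and returns a SkipResult carrying a reason string.

import os
import torch

def module_prefix(mod_name: str) -> str:
    # "torch.ao.nn" -> "<torch install dir>/ao/nn", the prefix used by the
    # filename.startswith() checks
    torch_dir = os.path.dirname(torch.__file__) + "/"
    return torch_dir + mod_name[len("torch."):].replace(".", "/")

def toy_should_skip(filename, allow_torch, legacy_prefixes, mod_prefixes, skip_prefixes):
    if any(filename.startswith(p) for p in legacy_prefixes):
        return False  # inlined via LEGACY_MOD_INLINELIST
    if allow_torch and any(filename.startswith(p) for p in mod_prefixes):
        return False  # inlined via MOD_INLINELIST
    if any(filename.startswith(p) for p in skip_prefixes):
        return True   # skipped (BUILTIN/THIRDPARTY skip dirs)
    return False      # inlined by default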
-def _check_file_inner(filename, allow_torch=False): +def check_file(filename, allow_torch=False): """Should skip this file?""" if filename is None: return SkipResult(True, "filename is None") - if any(filename.startswith(d) for d in get_file_inlinelist()): + if any(filename.startswith(d) for d in get_legacy_mod_inlinelist()): return SkipResult( False, - "inlined according skipfiles.FILE_INLINELIST", + "inlined according skipfiles.LEGACY_MOD_INLINELIST", ) - # TODO(ybliang): the is_torch check should be consolidate with is_torch_inline_allowed - if allow_torch and is_torch(filename): + if allow_torch and is_torch_inline_allowed(filename): return SkipResult( False, - "inlined according skipfiles.is_torch", + "inlined according skipfiles.MOD_INLINELIST", ) if is_fbcode and bool(FBCODE_SKIP_DIRS_RE.match(filename)): return SkipResult( @@ -330,17 +318,6 @@ def _check_file_inner(filename, allow_torch=False): return SkipResult(False, "inlined by default") -def check_file(filename, allow_torch=False, extra_check=False): - result = _check_file_inner(filename, allow_torch) - if extra_check and result.skipped and is_torch_inline_allowed(filename): - return SkipResult( - False, - "inlined according skipfiles.is_torch_inline_allowed returning True", - ) - else: - return result - - """ This is the main entry point to determine whether an object (function) should be inlined or skipped. Let's illustrate the logic with an example: @@ -366,10 +343,13 @@ def f3(x, y): * If f2 is skipped by Dynamo, when evaluating the frame of f3, Dynamo need the inline/skip check again and the call site is in catch_errors_wrapper.catch_errors of eval_frame.py. * For global variables and function arguments, Dynamo needs to decide if they are wrapped as SkipFilesVariable in builder.py. + +allow_torch is used to indicate whether we are checking the MOD_INLINELIST (torch modules), we only do this check when +f2 is not skipped. 
""" -def check_verbose(obj, allow_torch=False, extra_check=False): +def check_verbose(obj, allow_torch=False): if isinstance( obj, (UserFunctionVariable, UserMethodVariable, NestedUserFunctionVariable) ): @@ -387,11 +367,11 @@ def check_verbose(obj, allow_torch=False, extra_check=False): False, "inlined according skipfiles.FUNC_INLINELIST", ) - return check_file(filename, allow_torch, extra_check) + return check_file(filename, allow_torch) -def check(obj, allow_torch=False, extra_check=False): - return check_verbose(obj, allow_torch, extra_check).skipped +def check(obj, allow_torch=False): + return check_verbose(obj, allow_torch).skipped # skip common third party libs @@ -402,7 +382,7 @@ def check(obj, allow_torch=False, extra_check=False): def is_torch_inline_allowed(filename): - return any(filename.startswith(d) for d in get_submodule_inlinelist()) + return any(filename.startswith(d) for d in get_mod_inlinelist()) @functools.lru_cache(None) diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py index 148e7fa202d34..404c4b8972f90 100644 --- a/torch/_dynamo/symbolic_convert.py +++ b/torch/_dynamo/symbolic_convert.py @@ -1189,6 +1189,21 @@ def CALL_FUNCTION_KW(self, inst): assert len(kwargs) == len(argnames) self.call_function(fn, args, kwargs) + def LOAD_METHOD_SUPER(self, inst): + self.CALL_FUNCTION(dataclasses.replace(inst, argval=2)) + arg = inst.argval[0] + argval = self.code_options["co_names"][arg] + if sys.version_info < (3, 11): + self.LOAD_ATTR(dataclasses.replace(inst, argval=argval)) + else: + self.LOAD_METHOD(dataclasses.replace(inst, argval=argval)) + + def LOAD_ATTR_SUPER(self, inst): + self.CALL_FUNCTION(dataclasses.replace(inst, argval=2)) + arg = inst.argval[0] + argval = self.code_options["co_names"][arg] + self.LOAD_ATTR(dataclasses.replace(inst, argval=argval)) + def LOAD_METHOD(self, inst): self.LOAD_ATTR(inst) obj = self.pop() @@ -2242,7 +2257,7 @@ def check_inlineable(func): except NotImplementedError: pass # closures - result = skipfiles.check_verbose(func, extra_check=True) + result = skipfiles.check_verbose(func, allow_torch=True) if result.skipped: from torch._dynamo.variables.misc import ( produce_trampoline_autograd_apply, diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index 5eaa3d022a26c..88b33fa28c8c2 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -1337,6 +1337,15 @@ def get_debug_dir(): return _get_debug_dir(debug_root) +def extract_fake_example_value(node, required=True): + if "example_value" in node.meta and is_fake(node.meta["example_value"]): + return node.meta["example_value"] + elif required: + unimplemented("`FakeTensor` example value was required but not available") + else: + return None + + def get_fake_value(node, tx): """ Run the computation represented by `node` using fake tensors and return the result. 
@@ -1351,6 +1360,10 @@ def get_fake_value(node, tx): op = node.op + # FX Node should always return the same value + if "example_value" in node.meta and is_fake(node.meta["example_value"]): + return node.meta["example_value"] + def fake_wrapper(e): if isinstance(e, torch.Tensor): assert is_fake(e) @@ -2184,7 +2197,7 @@ def is_tensor_base_attr_getter(value): def has_torch_function(vt: "torch._dynamo.variables.base.VariableTracker") -> bool: from torch._dynamo.variables import UserDefinedObjectVariable - from torch._dynamo.variables.tensor import TensorWithTFOverrideVariable + from torch._dynamo.variables.torch_function import TensorWithTFOverrideVariable return isinstance(vt, TensorWithTFOverrideVariable) or ( isinstance(vt, UserDefinedObjectVariable) diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py index 9acd49790d48d..7962d52e988c7 100644 --- a/torch/_dynamo/variables/builder.py +++ b/torch/_dynamo/variables/builder.py @@ -133,10 +133,10 @@ SymNodeVariable, TensorSubclassVariable, TensorVariable, - TensorWithTFOverrideVariable, UnspecializedPythonVariable, ) from .torch import tensor_dunder_fns, torch_special_class_types, TorchVariable +from .torch_function import TensorWithTFOverrideVariable from .user_defined import ( KeyedJaggedTensorVariable, UserDefinedClassVariable, @@ -727,6 +727,7 @@ def index_source(key): istype(value, (type, types.FunctionType)) and skipfiles.check(value, allow_torch=True) and not inspect.getattr_static(value, "_torchdynamo_inline", False) + and not inspect.getattr_static(value, "__script_if_tracing_wrapper", False) ): return SkipFilesVariable( value, diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py index 761434cc137c4..7fb525f9c9863 100644 --- a/torch/_dynamo/variables/builtin.py +++ b/torch/_dynamo/variables/builtin.py @@ -27,6 +27,7 @@ check_constant_args, check_numpy_ndarray_args, check_unspec_python_args, + extract_fake_example_value, get_fake_value, guard_if_dyn, is_utils_checkpoint, @@ -575,14 +576,20 @@ def call_function( # Handle cases like int(torch.seed()) # Also handle sym_float to sym_int cases - if self.fn in (int, float) and isinstance(args[0], SymNodeVariable): + if self.fn in (int, float) and isinstance( + args[0], (SymNodeVariable, variables.TensorVariable) + ): + if isinstance(args[0], variables.TensorVariable): + item = args[0].call_method(tx, "item", [], {}) + else: + item = args[0] fn_ = sym_int if self.fn is int else sym_float out = wrap_fx_proxy( tx=tx, proxy=tx.output.create_proxy( "call_function", fn_, - (args[0].as_proxy(),), + (item.as_proxy(),), {}, ), **options, @@ -1425,6 +1432,12 @@ def _unimplemented(): if isinstance(left, TensorVariable): from .builder import wrap_fx_proxy_cls + if op is operator.is_ and isinstance(right, TensorVariable): + return ConstantVariable.create( + id(extract_fake_example_value(left.as_proxy().node)) + == id(extract_fake_example_value(right.as_proxy().node)) + ) + if op not in supported_tensor_comparison_ops.values(): _unimplemented() if ( @@ -1466,7 +1479,6 @@ def _unimplemented(): right, UserDefinedObjectVariable ): return ConstantVariable.create(op(left.value, right.value)) - if op.__name__ == "is_": # If the two objects are of different type, we can safely return False if type(left) is not type(right): diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py index 095c046a1f1fd..9d1d8675ff9a4 100644 --- a/torch/_dynamo/variables/lists.py +++ b/torch/_dynamo/variables/lists.py @@ -805,8 +805,12 @@ def 
_as_set_element(self, vt): assert isinstance(vt, VariableTracker) if isinstance(vt, TensorVariable): - tensor_node = vt.as_proxy().node - return SetVariable.SetElement(vt, tensor_node) + fake_tensor = vt.as_proxy().node.meta.get("example_value") + if fake_tensor is None: + unimplemented( + "Cannot check Tensor object identity without its fake value" + ) + return SetVariable.SetElement(vt, fake_tensor) if isinstance(vt, ConstantVariable): return SetVariable.SetElement(vt, vt.value) diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py index 5bb16debf35ca..cd5367f64b907 100644 --- a/torch/_dynamo/variables/tensor.py +++ b/torch/_dynamo/variables/tensor.py @@ -589,37 +589,8 @@ def has_bool_key(v): ) return ConstantVariable.create(None, **options) elif name in ("resize_", "resize_as_"): - if "memory_format" in kwargs: - memory_format = kwargs["memory_format"].as_python_constant() - else: - memory_format = torch.contiguous_format - - if name == "resize_": - self.size = args[0].as_python_constant() - self.is_contiguous = (memory_format,) - else: - assert isinstance(args[0], TensorVariable) - if self.size and args[0].size: - if ( - self.size == args[0].size - or memory_format is torch.preserve_format - ): - self.is_contiguous = args[0].is_contiguous - else: - self.size = args[0].size - self.stride = args[0].stride - self.ndim = args[0].ndim - self.is_contiguous = (memory_format,) - - return wrap_fx_proxy( - tx, - tx.output.create_proxy( - "call_method", - name, - *proxy_args_kwargs([self] + list(args), kwargs), - ), - **options, - ) + # Handling resizing in its full generality is difficult. + unimplemented(f"Tensor.{name}") elif ( name == "add_" and len(args) == 1 and len(kwargs) == 1 and "alpha" in kwargs ): @@ -845,95 +816,6 @@ def call_method( ) -class TensorWithTFOverrideVariable(VariableTracker): - """ - Represents a tensor subclass instance with a __torch_function__ override. 
- """ - - @staticmethod - def create( - tx, - tensor_variable, - torch_function_fn, - subclass_type, - **kwargs, - ): - var = TensorWithTFOverrideVariable( - tensor_variable, - torch_function_fn, - subclass_type, - **kwargs, - ) - # stash the subclass type to rewrap an output tensor if needed - if var.global_mangled_class_name() not in tx.output.global_scope: - tx.output.install_global(var.global_mangled_class_name(), subclass_type) - - return var - - def __init__( - self, - tensor_variable, - torch_function_fn, - subclass_type, - **kwargs, - ): - super().__init__(**kwargs) - self.tensor_variable = tensor_variable - self.torch_function_fn = torch_function_fn - self.subclass_type = subclass_type - - def as_proxy(self): - return self.tensor_variable.as_proxy() - - def python_type(self): - return self.subclass_type - - def subclass_type_var(self): - from ..source import GlobalSource - from .user_defined import UserDefinedClassVariable - - return UserDefinedClassVariable( - self.subclass_type, source=GlobalSource(self.global_mangled_class_name()) - ) - - def global_mangled_class_name(self): - return f"__subclass_{self.subclass_type.__name__}_{id(self.subclass_type)}" - - def call_torch_function(self, tx, fn, types, args, kwargs): - from .torch_function import call_torch_function - - return call_torch_function( - tx, - self.subclass_type_var(), - self.torch_function_fn, - fn, - types, - args, - kwargs, - ) - - def call_method( - self, - tx, - name, - args: "List[VariableTracker]", - kwargs: "Dict[str, VariableTracker]", - ) -> "VariableTracker": - # This code block implements inlining the __torch_function__ override - # of `call_method`. - if tx.output.torch_function_enabled: - import torch - from .builder import SourcelessBuilder - from .torch_function import dispatch_torch_function - - # [Note: __torch_function__] Currently we only support methods that are defined on tensor - # we will graph break in other cases this will need a bigger overhaul of extracting methods/comparing them for equality - func_var = SourcelessBuilder()(tx, getattr(torch.Tensor, name)) - return dispatch_torch_function(tx, func_var, [self] + args, kwargs) - else: - return self.tensor_variable.call_method(tx, name, args, kwargs) - - class NumpyNdarrayVariable(TensorVariable): """ Represents an np.ndarray, but backed by torch Tensor via torch._numpy.ndarray. 
@@ -1093,6 +975,7 @@ def call_function( ) -> VariableTracker: if len(args) == 1 and isinstance(args[0], TensorVariable): from .builder import VariableBuilder + from .torch_function import TensorWithTFOverrideVariable torch_fn = VariableBuilder( tx, AttrSource(self.source, "__torch_function__") diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py index cba3a86ac59da..dc71535745daa 100644 --- a/torch/_dynamo/variables/torch.py +++ b/torch/_dynamo/variables/torch.py @@ -41,8 +41,11 @@ from .distributed import is_constant_pg_functions, is_from_local, ProcessGroupVariable from .higher_order_ops import TorchHigherOrderOperatorVariable from .lists import ListVariable, TupleVariable -from .tensor import TensorWithTFOverrideVariable -from .torch_function import can_dispatch_torch_function, dispatch_torch_function +from .torch_function import ( + can_dispatch_torch_function, + dispatch_torch_function, + TensorWithTFOverrideVariable, +) log = logging.getLogger(__name__) diff --git a/torch/_dynamo/variables/torch_function.py b/torch/_dynamo/variables/torch_function.py index c317e39d682ec..a54907a71697a 100644 --- a/torch/_dynamo/variables/torch_function.py +++ b/torch/_dynamo/variables/torch_function.py @@ -2,6 +2,7 @@ from torch.utils._pytree import tree_flatten from ..exc import unimplemented from ..utils import is_tensor_base_attr_getter +from .base import VariableTracker from .constant import ConstantVariable from .lists import TupleVariable @@ -48,8 +49,6 @@ def call_torch_function( def can_dispatch_torch_function(tx, args, kwargs): - from .tensor import TensorWithTFOverrideVariable - if tx.output.torch_function_enabled: all_args = tree_flatten(args)[0] + tree_flatten(kwargs)[0] return any(isinstance(arg, TensorWithTFOverrideVariable) for arg in all_args) @@ -59,7 +58,6 @@ def can_dispatch_torch_function(tx, args, kwargs): def dispatch_torch_function(tx, fn, args, kwargs): """Gathers all args that are TensorWithTFOverrideVariable and dispatches based on the ordering in _get_overloaded_args""" - from .tensor import TensorWithTFOverrideVariable all_args = tree_flatten(args)[0] + tree_flatten(kwargs)[0] overloaded_args = _get_overloaded_args( @@ -82,3 +80,89 @@ def dispatch_torch_function(tx, fn, args, kwargs): unimplemented( f"All __torch_function__ overrides for call {fn} with args {args} and kwargs {kwargs} returned NotImplemented" ) + + +class TensorWithTFOverrideVariable(VariableTracker): + """ + Represents a tensor subclass instance with a __torch_function__ override. 
+ """ + + @staticmethod + def create( + tx, + tensor_variable, + torch_function_fn, + subclass_type, + **kwargs, + ): + var = TensorWithTFOverrideVariable( + tensor_variable, + torch_function_fn, + subclass_type, + **kwargs, + ) + # stash the subclass type to rewrap an output tensor if needed + if var.global_mangled_class_name() not in tx.output.global_scope: + tx.output.install_global(var.global_mangled_class_name(), subclass_type) + + return var + + def __init__( + self, + tensor_variable, + torch_function_fn, + subclass_type, + **kwargs, + ): + super().__init__(**kwargs) + self.tensor_variable = tensor_variable + self.torch_function_fn = torch_function_fn + self.subclass_type = subclass_type + + def as_proxy(self): + return self.tensor_variable.as_proxy() + + def python_type(self): + return self.subclass_type + + def subclass_type_var(self): + from ..source import GlobalSource + from .user_defined import UserDefinedClassVariable + + return UserDefinedClassVariable( + self.subclass_type, source=GlobalSource(self.global_mangled_class_name()) + ) + + def global_mangled_class_name(self): + return f"__subclass_{self.subclass_type.__name__}_{id(self.subclass_type)}" + + def call_torch_function(self, tx, fn, types, args, kwargs): + return call_torch_function( + tx, + self.subclass_type_var(), + self.torch_function_fn, + fn, + types, + args, + kwargs, + ) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + # This code block implements inlining the __torch_function__ override + # of `call_method`. + if tx.output.torch_function_enabled: + import torch + from .builder import SourcelessBuilder + + # [Note: __torch_function__] Currently we only support methods that are defined on tensor + # we will graph break in other cases this will need a bigger overhaul of extracting methods/comparing them for equality + func_var = SourcelessBuilder()(tx, getattr(torch.Tensor, name)) + return dispatch_torch_function(tx, func_var, [self] + args, kwargs) + else: + return self.tensor_variable.call_method(tx, name, args, kwargs) diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py index 541a6e1600450..82d897f4596f2 100644 --- a/torch/_functorch/aot_autograd.py +++ b/torch/_functorch/aot_autograd.py @@ -1601,7 +1601,7 @@ def inner_fn(*args): # This is annoying: our joint function needs to be aware of functionalization # (syncing mutated inputs before calling autograd.grad()) # In theory, we could make the autograd engine do this automatically, although that probably isn't any cleaner. - for i, arg in enumerate(args_maybe_cloned): + for arg in args_maybe_cloned: if not isinstance(arg, Tensor): continue sync_functional_tensor(arg) @@ -2999,22 +2999,6 @@ def runtime_wrapper(*args): continue original_inpt = args[inpt_idx] updated_inpt = updated_inputs[i] - # TODO: add better resize_() support for autograd case. - # Check for the case when an input has been resized. - # Note: One important thing to check for is user code that calls inpt.storage().resize_(). - # We can't trace operations on storage into the graph, so we should get dynamo to graph break. - # TODO: handle resize_() on inputs to a larger size. - # This is actually non-trivial to detect, so we should probably just handle it - # (or make dynamo detect). 
- # We can't just check of original_inpt.storage_size != updated_inpt.storage_size, - # Because the original_inpt might be a view of some larger tensor, - # and updated_inpt is always densely packed. - if not trace_joint and original_inpt.untyped_storage().size() != updated_inpt.untyped_storage().size(): - # It actually isn't enough just to see if the storage sizes are different between old and new inputs. - # If the original input was a slice into some larger storage, the same will not be true for the updated input. - # So before doing the resize_(), we **also** check that functionalization detected a metadata mutation. - if meta.mutates_metadata: - original_inpt.resize_(updated_inpt.size()) if meta.mutates_metadata and not meta.mutates_data: if trace_joint: assert isinstance(updated_inpt, TensorAlias) diff --git a/torch/_guards.py b/torch/_guards.py index 5dc85af1a79c5..e532a32cdd2ac 100644 --- a/torch/_guards.py +++ b/torch/_guards.py @@ -77,29 +77,6 @@ class GuardSource(enum.Enum): LOCAL_FSDP_MODULE = 7 GLOBAL_FSDP_MODULE = 8 - def select(self, locals_, globals_): - # SHAPE_ENV counts as locals, because the guard expressions - # created by shape env can reference f_locals - # - # RANDOM_VALUE counts as locals, because what we do is we run - # Python RNG and assign it to a temporary, and then perform - # guard tests on that temporary - if self in ( - GuardSource.LOCAL, - GuardSource.LOCAL_NN_MODULE, - GuardSource.LOCAL_FSDP_MODULE, - GuardSource.SHAPE_ENV, - GuardSource.RANDOM_VALUE, - ): - return locals_ - if self in ( - GuardSource.GLOBAL, - GuardSource.GLOBAL_NN_MODULE, - GuardSource.GLOBAL_FSDP_MODULE, - ): - return globals_ - raise NotImplementedError(str(self)) - def is_fsdp_module(self) -> bool: return self in (GuardSource.GLOBAL_FSDP_MODULE, GuardSource.LOCAL_FSDP_MODULE) @@ -255,8 +232,8 @@ def __str__(self): output += f" Guarded Class Weakref: {self.guarded_class_weakref}\n" return output - def create(self, local_builder: GuardBuilderBase, global_builder: GuardBuilderBase): - return self.create_fn(self.source.select(local_builder, global_builder), self) + def create(self, builder: GuardBuilderBase): + return self.create_fn(builder, self) def is_nn_module(self): return self.source.is_nn_module() diff --git a/torch/_inductor/bounds.py b/torch/_inductor/bounds.py index 9896c499bc987..82f85b031c7d0 100644 --- a/torch/_inductor/bounds.py +++ b/torch/_inductor/bounds.py @@ -67,9 +67,18 @@ def swap_submodules( subblock = self.loop_body.subblocks[key] # The result within the lambda will reference to the final # set of modules at the end of the for-loop as it stores a reference to it - result[key] = lambda mask, value: self.masked_subblock( - subblock, self._bounds, mask, value, result - ) + + # bind subblock in a function because python lambdas close over by reference + # moving the lambda out of make_fn would close over the reference to subblock, + # so all lambdas would have the same subblock reference that is the final + # subblock in the loop + def make_fn(subblock): + return lambda mask, value: self.masked_subblock( + subblock, self._bounds, mask, value, result + ) + + result[key] = make_fn(subblock) + else: assert "set_indirect" in key idx = int(key[len("set_indirect") :]) diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py index 010f48f52bdb6..f8e81b7cb0621 100644 --- a/torch/_inductor/codegen/wrapper.py +++ b/torch/_inductor/codegen/wrapper.py @@ -6,7 +6,7 @@ import os import re from itertools import count -from typing import Any, Dict, List, 
Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import sympy from sympy import Expr @@ -354,6 +354,7 @@ def write_header(self): from torch._inductor.select_algorithm import extern_kernels aten = torch.ops.aten + inductor_ops = torch.ops.inductor assert_size_stride = torch._C._dynamo.guards.assert_size_stride reinterpret_tensor = torch.ops.inductor._reinterpret_tensor async_compile = AsyncCompile() @@ -776,7 +777,7 @@ def define_user_defined_triton_kernel(self, kernel, kwargs): from ..ir import Buffer from .common import SizeArg, TensorArg - signature = [] + signature: List[Union[TensorArg, SizeArg]] = [] constants = {} for key, arg in kwargs.items(): # Not a real argument diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index 6167a7c917bf2..3970c9c6dd63e 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -201,6 +201,9 @@ # how many nodes to allow into a single fusion max_fusion_size = 64 +# max number of inputs to generate cat as a pointwise op with masked laods +max_pointwise_cat_inputs = 4 + # replace small reductions with pointwise, disable with `= 1` unroll_reductions_threshold = 8 diff --git a/torch/_inductor/fx_passes/post_grad.py b/torch/_inductor/fx_passes/post_grad.py index 890d504ffc81a..3e7f31621f8e6 100644 --- a/torch/_inductor/fx_passes/post_grad.py +++ b/torch/_inductor/fx_passes/post_grad.py @@ -35,7 +35,7 @@ register_graph_pattern, stable_topological_sort, ) -from ..utils import decode_device, is_view +from ..utils import decode_device, is_pointwise_use from ..virtualized import V from .group_batch_fusion import group_batch_fusion_post_grad_passes @@ -784,21 +784,6 @@ def view_to_reshape(gm): nd.target = torch.ops.aten.reshape.default -def is_pointwise_use(use): - if not use.op == "call_function": - return False - - if not ( - isinstance(use.target, torch._ops.OpOverload) or use.target is operator.getitem - ): - return False - - if use.target is operator.getitem or is_view(use.target): - return all(is_pointwise_use(u) for u in use.users) - - return torch.Tag.pointwise in use.target.tags - - def should_prefer_unfused_addmm(match): inp = match.kwargs["inp"] if not inp.meta["val"].is_cuda: diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index 0d8ef4f452c72..02113274b48f2 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -732,7 +732,9 @@ def run_node(self, n: torch.fx.Node): if n.op == "call_function": args, kwargs = self.fetch_args_kwargs_from_env(n) origins |= gather_origins(args, kwargs) - with ir.IRNode.current_origins(origins), self.set_current_node(n): + with ir.IRNode.current_origins(origins), self.set_current_node( + n + ), V.set_current_node(n): if ( n.op == "call_function" and n.target is not operator.getitem diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 1f54546bc290e..990ced46c3c3f 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -3238,6 +3238,16 @@ def create(cls, inputs, dim): return kernel + @classmethod + def can_realize_into_without_copy(cls, src): + if isinstance(src, TensorBox): + # unwrap a TensorBox + return cls.can_realize_into_without_copy(src.data) + + return isinstance(src.data.layout, FlexibleLayout) and not isinstance( + src.data, ExternKernelAlloc + ) + @classmethod def realize_into(cls, src, dst): # Attempt to turn this into a ReinterpretView rather than assert. 
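The make_fn indirection added in torch/_inductor/bounds.py earlier in this patch works around Python's late-binding closures. A tiny standalone illustration of the pitfall and of the fix:

# every lambda below closes over the *variable* i, not its value at creation time
late = [lambda: i for i in range(3)]
print([f() for f in late])   # [2, 2, 2]

# passing the value through an extra function binds it per iteration,
# which is what make_fn does for `subblock`
def make_fn(i):
    return lambda: i

bound = [make_fn(i) for i in range(3)]
print([f() for f in bound])  # [0, 1, 2]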
@@ -3255,9 +3265,7 @@ def realize_into(cls, src, dst): src.realize() # ExternKernelAlloc has specific requirements for output layout, should create a copy assert hasattr(src.data, "layout") - if isinstance(src.data.layout, FlexibleLayout) and not isinstance( - src.data, ExternKernelAlloc - ): + if cls.can_realize_into_without_copy(src): src.data.layout = AliasedLayout(dst) return src.data # introduce a copy @@ -3815,6 +3823,33 @@ def __init__(self, x, *constant_args): self.name = V.graph.register_buffer(self) +class AccumulateGrad(ExternKernel): + """ + This needs to be a custom class to handle mutation properly + """ + + kernel = "inductor_ops.accumulate_grad_" + + def codegen(self, wrapper): + (variable, new_grad) = (t.codegen_reference() for t in self.inputs) + wrapper.writeline(f"{self.kernel}({variable}, {new_grad})") + + def should_allocate(self): + return False + + def get_mutation_names(self): + assert isinstance(self.layout, MutationLayout) + return (self.layout.target.get_name(),) + + def __init__(self, variable, new_grad): + super().__init__( + None, + MutationLayout(variable), + self.unwrap_storage([variable, new_grad]), + ) + self.name = V.graph.register_buffer(self) + + class ScatterFallback(ExternKernel): """ This needs to be a custom class to handle mutation properly. diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index 9bc59ea32b7d5..972bfeb255cde 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -48,7 +48,14 @@ validate_ir, View, ) -from .utils import ceildiv, decode_device, is_dynamic, pad_listlike, sympy_product +from .utils import ( + ceildiv, + decode_device, + is_dynamic, + is_pointwise_use, + pad_listlike, + sympy_product, +) from .virtualized import ops, V log = logging.getLogger(__name__) @@ -947,6 +954,76 @@ def as_strided_copy(x, size, stride, storage_offset=None): return clone(result) +def pointwise_cat(inputs, dim=0): + # (inclusive, exclusive) + inputs_ranges: List[Tuple[sympy.Expr, sympy.Expr]] = [] + prev_end = 0 + for inp in inputs: + inputs_ranges.append((prev_end, prev_end + inp.get_size()[dim])) + prev_end = inputs_ranges[-1][-1] + + inputs_loaders = [inp.make_loader() for inp in inputs] + + def inner_fn(idx): + idx_dim = ops.index_expr(idx[dim], torch.int64) + + masks = [] + masked_loads = [] + for i in range(len(inputs)): + start = ( + ops.constant(0, torch.int64) + if i == 0 + else ops.index_expr(inputs_ranges[i][0], torch.int64) + ) + end = ops.index_expr(inputs_ranges[i][1], torch.int64) + + start_cond = ops.ge(idx_dim, start) + end_cond = ops.lt(idx_dim, end) + if i == 0: + mask = end_cond + elif i == len(inputs) - 1: + mask = start_cond + else: + mask = ops.and_(start_cond, end_cond) + + masks.append(mask) + idx_load = list(idx) + + # if we're concatting [4], [2] + # when we index the second tensor for 5 we want to index 5 - 4 + idx_load[dim] -= inputs_ranges[i][0] + + masked_loads.append( + ops.masked( + mask, + lambda: inputs_loaders[i](idx_load), + 0.0, # this value should be unused + ), + ) + + def get_masked_val(i): + if i != len(inputs) - 1: + return ops.where( + masks[i], + masked_loads[i], + get_masked_val(i + 1), + ) + else: + return masked_loads[-1] + + return get_masked_val(0) + + new_size = list(inputs[0].get_size()) + new_size[dim] = inputs_ranges[-1][-1] + + return Pointwise.create( + device=inputs[0].get_device(), + dtype=inputs[0].get_dtype(), + inner_fn=inner_fn, + ranges=new_size, + ) + + @register_lowering(aten.cat) def cat(inputs, dim=0): if all(input.get_dtype() is 
torch.uint8 for input in inputs): @@ -966,6 +1043,36 @@ def cat(inputs, dim=0): *inputs, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT ) inputs = [to_dtype(inp, dtype) for inp in inputs] + + def should_lower_cat_input(x) -> bool: + # Unrealized inputs will not be storage and layouts, and we dont want to realize + # them in case we want to fuse + if ir.is_storage_and_layout(x): + storage, _ = ir.as_storage_and_layout(x, freeze=False) + return not ir.ConcatKernel.can_realize_into_without_copy(storage) + + if isinstance(x, TensorBox): + if isinstance(x.data, ir.BaseView): + return should_lower_cat_input(x.data.unwrap_view()) + else: + return should_lower_cat_input(x.data) + + if isinstance(x, ir.StorageBox): + return should_lower_cat_input(x.data) + + if isinstance(x, ir.Pointwise): + return True + + return False + + if len(inputs) <= config.max_pointwise_cat_inputs: + pointwise_uses = all(is_pointwise_use(use) for use in V.current_node.users) + all_pointwise_inputs = all(should_lower_cat_input(inp) for inp in inputs) + any_pointwise_inputs = any(should_lower_cat_input(inp) for inp in inputs) + + if all_pointwise_inputs or (any_pointwise_inputs and pointwise_uses): + return pointwise_cat(inputs, dim) + return TensorBox(ir.ConcatKernel.create(inputs, dim)) @@ -4214,7 +4321,6 @@ def loader(index, reduction_index): assert len(reduction_index) == len(reduced_idx) if keepdims: assert len(index) == len(size) - assert all(index[i] == 0 for i in reduced_idx) index = [index[i] for i in kept_idx] assert len(index) == len(kept_idx) new_index = [None] * (len(index) + len(reduction_index)) @@ -4864,6 +4970,15 @@ def _realize(x): return clone(x) +@register_lowering(torch.ops.inductor.accumulate_grad_) +def accumulate_grad_(variable, new_grad): + # TODO(jansel): decompose into `variable.grad += new_grad` when variable.grad is defined + variable.realize() + new_grad.realize() + ir.AccumulateGrad(variable, new_grad) + return variable + + @register_lowering(triton_kernel_wrapper_mutation) def triton_kernel_wrap_(*, kernel_idx, grid, kwargs): ir.UserDefinedTritonKernel(kernel_idx=kernel_idx, grid=grid, kernel_args=kwargs) diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index 801f6ab3ac6ea..66ab71646ac11 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -264,6 +264,21 @@ def is_view(op: torch._ops.OpOverload): return any(a.alias_info is not None for a in op._schema.arguments) +def is_pointwise_use(use): + if not use.op == "call_function": + return False + + if not ( + isinstance(use.target, torch._ops.OpOverload) or use.target is operator.getitem + ): + return False + + if use.target is operator.getitem or is_view(use.target): + return all(is_pointwise_use(u) for u in use.users) + + return torch.Tag.pointwise in use.target.tags + + def gen_gm_and_inputs(target, args, kwargs): g = torch.fx.Graph() g_args = [] diff --git a/torch/_inductor/virtualized.py b/torch/_inductor/virtualized.py index 6c35928a4bf77..ce6438e52e979 100644 --- a/torch/_inductor/virtualized.py +++ b/torch/_inductor/virtualized.py @@ -173,6 +173,7 @@ def __getattr__(self, item): _debug = Virtualized("debug", NullHandler) _interpreter = Virtualized("interpreter", NullHandler) _aot_compilation = Virtualized("aot_compilation", NullHandler) +_current_node = Virtualized("current_node", NullHandler) class OpsValue: @@ -281,6 +282,8 @@ class _V: set_interpreter_handler: Callable[[Any], Any] = _interpreter._set_handler set_aot_compilation: Callable[[Any], Any] = _aot_compilation._set_handler 
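The `pointwise_cat` lowering above turns concatenation into a single pointwise kernel: each output index along `dim` selects, through a chain of masks and `where`s, a masked load from whichever input owns that index, shifted into that input's local coordinates. A rough eager-mode sketch of the same idea using plain PyTorch ops instead of inductor's `ops.*` handlers (the gather-based load and the helper name are illustrative, not the generated code):

```python
import torch

def pointwise_cat_sketch(inputs, dim=0):
    # (inclusive, exclusive) ranges each input occupies along `dim`
    ranges, prev = [], 0
    for t in inputs:
        ranges.append((prev, prev + t.shape[dim]))
        prev = ranges[-1][1]

    out_shape = list(inputs[0].shape)
    out_shape[dim] = prev
    # output indices along `dim`, shaped to broadcast against the output
    idx = torch.arange(prev).view([-1 if d == dim else 1 for d in range(len(out_shape))])

    # Fold inputs together back-to-front with `where`, shifting the output index
    # into each input's local coordinates -- the masked-load structure above.
    result = None
    for t, (start, end) in reversed(list(zip(inputs, ranges))):
        local = (idx - start).clamp(0, t.shape[dim] - 1)
        gathered = torch.gather(t, dim, local.expand(*out_shape))
        mask = (idx >= start) & (idx < end)
        result = gathered if result is None else torch.where(mask, gathered, result)
    return result

a, b = torch.randn(2, 4), torch.randn(2, 3)
assert torch.equal(pointwise_cat_sketch([a, b], dim=1), torch.cat([a, b], dim=1))
```

Per the new `cat` lowering, this path is only taken for at most `max_pointwise_cat_inputs` inputs, and only when every input would lower pointwise or when some do and every user of the cat is pointwise, so the masked loads can fuse instead of forcing a copy through `ConcatKernel`.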
get_aot_compilation: Callable[[], Any] = _aot_compilation._get_handler + set_current_node: Callable[[Any], Any] = _current_node._set_handler + get_current_node: Callable[[], Any] = _current_node._get_handler @property def ops(self) -> _MockHandler: @@ -319,5 +322,9 @@ def interpreter(self): def aot_compilation(self): return _aot_compilation._get_handler() + @property + def current_node(self): + return _current_node._get_handler() + V = _V() diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index d4e1d68091a63..c20ccc0f7aeaf 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -352,9 +352,10 @@ def meta_copy_(self, src, non_blocking=False): "more than one element of the written-to tensor refers to a single memory location" ) - intermediate = src.to(self, non_blocking) - if self.size() != intermediate.size(): - aten.expand_copy.default(intermediate, self.size()) + if isinstance(src, Tensor): + intermediate = src.to(self, non_blocking) + if self.size() != intermediate.size(): + aten.expand_copy.default(intermediate, self.size()) return self diff --git a/torch/ao/quantization/fx/_model_report/model_report_visualizer.py b/torch/ao/quantization/fx/_model_report/model_report_visualizer.py index 8e04338446dab..5463862aa1cd4 100644 --- a/torch/ao/quantization/fx/_model_report/model_report_visualizer.py +++ b/torch/ao/quantization/fx/_model_report/model_report_visualizer.py @@ -252,7 +252,7 @@ def _generate_channels_table( if len(channel_features) > 0: # now we add all channel data - for index, module_fqn in enumerate(filtered_data): + for module_fqn in filtered_data: # we iterate over all channels for channel in range(num_channels): # we make a new row for the channel @@ -644,7 +644,7 @@ def generate_histogram_visualization(self, feature_filter: str, module_fqn_filte # set the legend as well # combine all the data all_data = [] - for index, channel_info in enumerate(y_data): + for channel_info in y_data: all_data.extend(channel_info) val, bins, _ = plt.hist( diff --git a/torch/ao/quantization/pt2e/prepare.py b/torch/ao/quantization/pt2e/prepare.py index bae416b6394e0..32ed0b4bd8b1a 100644 --- a/torch/ao/quantization/pt2e/prepare.py +++ b/torch/ao/quantization/pt2e/prepare.py @@ -5,13 +5,13 @@ _get_output_act_obs_or_fq, _get_dtype_and_is_dynamic, _insert_obs_or_fq, - _maybe_insert_output_observer_for_node, _save_state, _is_activation_post_process_node, - _get_qspec_for_arg, + _create_obs_or_fq_from_qspec, ) from torch.fx import ( GraphModule, + Graph, Node, ) from torch.fx.node import Argument @@ -19,14 +19,217 @@ from torch.ao.quantization import QConfigMapping from torch.ao.quantization.qconfig import QConfigAny from torch.ao.quantization.fx.custom_config import PrepareCustomConfig -from typing import Dict, Tuple, Union, Any +from typing import Dict, Tuple, Union, Any, Optional from torch.ao.quantization.quantizer import ( QuantizationAnnotation, EdgeOrNode, SharedQuantizationSpec, + QuantizationSpecBase, ) from torch.ao.quantization import ObserverOrFakeQuantize +# TODO: make pt2e folder private? 
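For orientation, the observer-sharing rework in `torch/ao/quantization/pt2e/prepare.py` below sits inside the usual PT2E quantization flow. A hedged end-to-end sketch, where the capture entry point (`capture_pre_autograd_graph`) and the `XNNPACKQuantizer` are assumed from the same-era API and may differ in other builds:

```python
import torch
from torch._export import capture_pre_autograd_graph  # assumed capture entry point
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (  # assumed quantizer
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)

class M(torch.nn.Module):
    def forward(self, x, y):
        return torch.cat([x, y], dim=1)

example_inputs = (torch.randn(1, 3, 8, 8), torch.randn(1, 3, 8, 8))
m = capture_pre_autograd_graph(M(), example_inputs)

quantizer = XNNPACKQuantizer()
quantizer.set_global(get_symmetric_quantization_config())
m = prepare_pt2e(m, quantizer)  # builds the edge/node -> observer map described below
m(*example_inputs)              # calibration pass
m = convert_pt2e(m)
```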
+__all__ = [ + "prepare", +] + +def _find_root(edge_or_node: EdgeOrNode, shared_with_map: Dict[EdgeOrNode, EdgeOrNode]) -> EdgeOrNode: + """Find the root node for the sharing tree + Args: + edge_or_node: edge/node that we want to find the root + shared_with_map: each edge/node points to the parent, the root node will points to itself + + Returns: + root edge/node + """ + parent = shared_with_map[edge_or_node] + if parent == edge_or_node: + return edge_or_node + root = _find_root(parent, shared_with_map) + # path compression + shared_with_map[edge_or_node] = root + return root + +def _union(parent: EdgeOrNode, child: EdgeOrNode, shared_with_map: Dict[EdgeOrNode, EdgeOrNode]) -> None: + """Merge the subtree for `child` with `parent`, the order is important here + """ + root_parent = _find_root(parent, shared_with_map) + root_child = _find_root(child, shared_with_map) + # union the two trees by pointing the root of child to root of parent + shared_with_map[root_child] = root_parent + +def _update_shared_with(edge_or_node: EdgeOrNode, qspec: QuantizationSpecBase, shared_with_map: Dict[EdgeOrNode, EdgeOrNode]): + """Update the `shared_with_map` based on the qspec, this applies the `SharedQuantizationSpec` + configuration and established the relationship between `edge_or_node` with the edge/node that it + is pointing to, we'll use this information in the end to get the group id + """ + if isinstance(qspec, SharedQuantizationSpec): + sharing_with = qspec.edge_or_node + # we point from edge_or_node to the node that it is sharing_with, e.g. + # qspec for a = SharedQuantizationSpec(b) means `a` points to `b` + _union(sharing_with, edge_or_node, shared_with_map) + +def _find_root_qspec( + qspec: QuantizationSpecBase, + edge_or_node_to_qspec: Dict[EdgeOrNode, QuantizationSpecBase], + shared_with_map: Dict[EdgeOrNode, EdgeOrNode] +) -> QuantizationSpecBase: + """Unwraps qspec to get the final root qspec (non SharedQuantizationSpec) + if qspec is SharedQuantizationSpec + (1). tries to find the root node for the node that the qspec points to + (2). 
recursively find the root qspec based on the qspec for the root node + """ + if isinstance(qspec, SharedQuantizationSpec): + sharing_with = qspec.edge_or_node + root = _find_root(sharing_with, shared_with_map) + qspec = edge_or_node_to_qspec[root] + return _find_root_qspec(qspec, edge_or_node_to_qspec, shared_with_map) + return qspec + +def _has_same_dtype(qspec_a: QuantizationSpecBase, qspec_b: QuantizationSpecBase): + return ( + hasattr(qspec_a, "dtype") and + hasattr(qspec_b, "dtype") and + qspec_a.dtype == qspec_b.dtype + ) + +def _has_same_is_dynamic(qspec_a: QuantizationSpecBase, qspec_b: QuantizationSpecBase): + return ( + hasattr(qspec_a, "is_dynamic") and + hasattr(qspec_b, "is_dynamic") and + qspec_a.is_dynamic == qspec_b.is_dynamic + ) + +def _get_edge_or_node_to_qspec(model: torch.fx.GraphModule) -> Dict[EdgeOrNode, QuantizationSpecBase]: + """Get a map from EdgeOrNode to quantization spec based on annotations on the nodes + """ + edge_or_node_to_qspec: Dict[EdgeOrNode, QuantizationSpecBase] = {} + for n in model.graph.nodes: + if hasattr(n, "meta") and "quantization_annotation" in n.meta: + qa = n.meta["quantization_annotation"] + for input_to_n, qspec in qa.input_qspec_map.items(): + input_edge = (input_to_n, n) + edge_or_node_to_qspec[input_edge] = qspec + if qa.output_qspec is not None: + output_node = n + qspec = qa.output_qspec + edge_or_node_to_qspec[output_node] = qspec + return edge_or_node_to_qspec + +def _get_edge_or_node_to_group_id(edge_or_node_to_qspec: Dict[EdgeOrNode, QuantizationSpecBase]) -> Dict[EdgeOrNode, int]: + """Map from edge/node to the group ID, generated from quantization annotations, + edge/node with the same group ID should use the same observer/fake_quant instance + + This is applying SharedQuantizationSpec configuration and map each edge/node to a group + There is another implicit sharing that's built in the quantization, when we have the following: + * op1 -> op2 + * output of op1: int8_qspec + * (op1 -> op2) input edge: int8_qspec + we'll assume sharing between the output of op1 and input of (op1 -> op2) since these are the same Tensor. 
+ + Figuring out the correct group ID for all edge/node is a standard union find problem: + https://www.geeksforgeeks.org/introduction-to-disjoint-set-data-structure-or-union-find-algorithm/ + + Args: + edge_or_node_to_qspec: Dictionary from edge_or_node to the qspec, derived from annotations + Returns: + edge_or_node_to_group_id: Dictionary from edge_or_node to group_id (int), all edge or node that + belongs to the same group should have the same id + + Example: + op2 -> cat1 -> cat2 + op1 / / + op3 + edge_or_node_to_qspec: { + op1: int8_qspec, + op2: int8_qspec, + (op1, cat1): int8_qspc, + (op2, cat1): SharedQuantizationSpec((op1, cat1)), + cat1: SharedQuantizationSpec((op1, cat1)), + (op3, cat2): int8_qspec, + (cat1, cat2): SharedQuantizationSpec((op3, cat2)), + cat2: SharedQuantizationSpec((op3, cat2)), + } + + edge_or_node_to_group_id = _get_edge_or_node_to_group_id(edge_or_node_to_qspec) + edge_or_node_to_group_id: { + op1: 1, + op2: 1, + (op1, cat1): 1, + (op2, cat1): 1, + cat1: 1, + (op3, cat2): 1, + (cat1, cat2): 1, + cat2: 1, + } + # everything are in the same group because (cat1) and (cat1, cat2) are implicitly shared, which + # connects the two sharing group around cat1 and cat2 op due to transitive sharing + """ + # means the observer of key should be shared with observer with value, by default it will + # be shared with itself + shared_with_map: Dict[EdgeOrNode, EdgeOrNode] = {k: k for k in edge_or_node_to_qspec.keys()} + for edge_or_node, qspec in edge_or_node_to_qspec.items(): + if isinstance(edge_or_node, torch.fx.Node): + output_node = edge_or_node + _update_shared_with(output_node, qspec, shared_with_map) + else: + input_edge = edge_or_node + input_edge_root = _find_root(input_edge, shared_with_map) + input_edge_root_qspec = edge_or_node_to_qspec[input_edge_root] + input_edge_root_qspec = _find_root_qspec(input_edge_root_qspec, edge_or_node_to_qspec, shared_with_map) + + # find root_qspec for `arg` Node (the output of previous node) + assert isinstance(input_edge, tuple) + arg, n = input_edge + arg_as_output_root_qspec = None + if arg in edge_or_node_to_qspec: + arg_as_output_qspec = edge_or_node_to_qspec[arg] + arg_as_output_root_qspec = _find_root_qspec(arg_as_output_qspec, edge_or_node_to_qspec, shared_with_map) + # TODO: add assertions for types of root qspecs + if ( + arg_as_output_root_qspec is not None and + _has_same_dtype(arg_as_output_root_qspec, input_edge_root_qspec) and + _has_same_is_dynamic(arg_as_output_root_qspec, input_edge_root_qspec) + ): + # the input arg to the node should reuse the existing output observer for arg + # since dtype is the same (we may want to extend this to be a more strict check + # in the future) + # so we point from `input_edge` to `arg` (output of the argument) + _union(arg, input_edge, shared_with_map) + _update_shared_with(input_edge, qspec, shared_with_map) + + # now that we get the sharing relations between all edges and nodes, we can assingn group ids + cur_group_id = 0 + edge_or_node_to_group_id: Dict[EdgeOrNode, int] = {} + for edge_or_node in shared_with_map.keys(): + root = _find_root(edge_or_node, shared_with_map) + if root not in edge_or_node_to_group_id: + edge_or_node_to_group_id[root] = cur_group_id + cur_group_id += 1 + edge_or_node_to_group_id[edge_or_node] = edge_or_node_to_group_id[root] + + return edge_or_node_to_group_id + +def _get_obs_or_fq_map( + edge_or_node_to_group_id: Dict[EdgeOrNode, int], + edge_or_node_to_qspec: Dict[EdgeOrNode, QuantizationSpecBase], + is_qat: bool +) -> Dict[EdgeOrNode, 
ObserverOrFakeQuantize]: + """Generates the EdgeOrNode to observer/fake_quant instances + Makes sure that for EdgeOrNode that has the same group_id should have the same observer or fake quant + instances + """ + obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize] = {} + group_id_to_obs_or_fq: Dict[int, ObserverOrFakeQuantize] = {} + for edge_or_node, qspec in edge_or_node_to_qspec.items(): + group_id = edge_or_node_to_group_id[edge_or_node] + if group_id not in group_id_to_obs_or_fq: + # TODO: maybe edge_or_node_to_qspec should be edge_or_node_to_root_qspec, this will simplify + # the implementation for _create_obs_or_fq_from_qspec + group_id_to_obs_or_fq[group_id] = _create_obs_or_fq_from_qspec(qspec, obs_or_fq_map, is_qat) + obs_or_fq_map[edge_or_node] = group_id_to_obs_or_fq[group_id] + return obs_or_fq_map + def _maybe_insert_input_observer_for_arg_or_kwarg( node: Union[Node, Any], arg: Argument, @@ -72,21 +275,11 @@ def _maybe_insert_input_observer_for_arg_or_kwarg( observed_arg = arg.args[0] assert isinstance(observed_arg, Node), f"expect observed argument to be a Node, but got: {type(observed_arg)}" assert observed_arg in obs_or_fq_map, \ - f"can't refer to a node that does not have observer/fake_quant inserted yet: {observed_arg}" - input_qspec_map = quantization_annotation.input_qspec_map - input_arg_qspec = _get_qspec_for_arg(arg, input_qspec_map, named_modules) - if isinstance(input_arg_qspec, SharedQuantizationSpec): - # if the argument is set to use SharedQuantizationSpec, we will - # reset the observer instance to align with the configured edge/node - obs_or_fq_name = arg.target - setattr(model, obs_or_fq_name, arg_as_input_act_obs_or_fq) - named_modules[obs_or_fq_name] = arg_as_input_act_obs_or_fq - else: - # otherwise reuse the existing obs/fq - arg_as_input_act_obs_or_fq = obs_or_fq_map[observed_arg] + f"can't find a sharing group for node: {observed_arg}" + # reuse the existing obs/fq + arg_as_input_act_obs_or_fq = obs_or_fq_map[observed_arg] # we don't need to insert new observer node new_arg = arg - obs_or_fq_map[(observed_arg, node)] = arg_as_input_act_obs_or_fq else: # skip inserting new observers if there is an observer inserted for the arg before # that has the same dtype that we want to insert here @@ -113,23 +306,24 @@ def _maybe_insert_input_observer_for_arg_or_kwarg( assert arg_as_input_act_obs_or_fq is not None if existing_obs_node is None: + maybe_observed_arg = arg + # When quantizing two layers with different configs we can have + # conv2d (int8) -> avgpool(uint8) + # In this case observer insertion for avgpool will come here but the input + # to avgpool will be output observer of conv2d + # Now the obs map that we update must correspond to the original input of + # avgpool and not the output obs of conv2d + # This is because when referring to the edge, quantizer would refer to + # original input and not the observed one. 
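The grouping performed by `_find_root`, `_union`, and `_get_edge_or_node_to_group_id` above is standard union-find with path compression. A dependency-free sketch of the same data structure, using strings in place of real `EdgeOrNode` keys:

```python
from typing import Dict, Hashable

def find(x: Hashable, parent: Dict[Hashable, Hashable]) -> Hashable:
    # Walk up to the root; compress the path so later lookups are near O(1).
    if parent[x] == x:
        return x
    root = find(parent[x], parent)
    parent[x] = root
    return root

def union(a: Hashable, b: Hashable, parent: Dict[Hashable, Hashable]) -> None:
    # Merge b's tree into a's tree, mirroring _union(parent, child, ...) above.
    parent[find(b, parent)] = find(a, parent)

# Toy version of the docstring example: shared edges/nodes collapse into one group.
keys = ["op1", "op2", "(op1, cat1)", "(op2, cat1)", "cat1"]
parent = {k: k for k in keys}
union("(op1, cat1)", "(op2, cat1)", parent)  # SharedQuantizationSpec((op1, cat1))
union("(op1, cat1)", "cat1", parent)
groups: Dict[Hashable, int] = {}
group_ids = {k: groups.setdefault(find(k, parent), len(groups)) for k in keys}
# op1 and op2 keep their own groups; the two cat edges and the cat node share one id.
```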
+ while _is_activation_post_process_node(arg, named_modules): + arg = arg.args[0] # type: ignore[assignment] + arg_as_input_act_obs_or_fq = obs_or_fq_map[(arg, node)] new_obs_node = _insert_obs_or_fq( - arg, arg_as_input_act_obs_or_fq, model, named_modules, model.graph) + maybe_observed_arg, arg_as_input_act_obs_or_fq, model, named_modules, model.graph) # override this arg to be the observed arg new_arg = new_obs_node else: new_arg = existing_obs_node - # When quantizing two layers with different configs we can have - # conv2d (int8) -> avgpool(uint8) - # In this case observer insertion for avgpool will come here but the input - # to avgpool will be output observer of conv2d - # Now the obs map that we update must correspond to the original input of - # avgpool and not the output obs of conv2d - # This is because when referring to the edge, quantizer would refer to - # original input and not the observed one. - while _is_activation_post_process_node(arg, named_modules): - arg = arg.args[0] # type: ignore[assignment] - obs_or_fq_map[(arg, node)] = arg_as_input_act_obs_or_fq return new_arg @@ -172,6 +366,19 @@ def _maybe_insert_input_observers_for_node( # assign the new args to the node, inplace node.args = tuple(new_args) +def _maybe_insert_output_observer_for_node( + node: Node, + model: torch.nn.Module, + named_modules: Dict[str, torch.nn.Module], + graph: Graph, + obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize], + is_qat: bool, +) -> Optional[Node]: + if node in obs_or_fq_map: + output_act_obs_or_fq = obs_or_fq_map[node] + return _insert_obs_or_fq(node, output_act_obs_or_fq, model, named_modules, graph) + return None + def _maybe_insert_input_and_output_observers_for_node( node: Node, model: torch.fx.GraphModule, @@ -213,7 +420,8 @@ def _maybe_insert_input_and_output_observers_for_node( return # this returns the new observer node if it was needed - maybe_output_obs_node = _maybe_insert_output_observer_for_node(node, model, named_modules, model.graph, obs_or_fq_map, is_qat) + maybe_output_obs_node = _maybe_insert_output_observer_for_node( + node, model, named_modules, model.graph, obs_or_fq_map, is_qat) if maybe_output_obs_node is None: return @@ -246,9 +454,17 @@ def prepare( # Since we are mutating the graph as we go, we iterate over the original # nodes before observer insertion, instead of model.graph.nodes. 
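Once group ids exist, `_get_obs_or_fq_map` simply memoises one observer/fake-quant instance per group so every member of a group aliases the same object. A toy sketch with a stand-in observer class (`FakeObserver` and `build_obs_map` are hypothetical, for illustration only):

```python
from typing import Dict, Hashable

class FakeObserver:
    """Stand-in for an ObserverOrFakeQuantize instance (illustrative only)."""

def build_obs_map(group_ids: Dict[Hashable, int]) -> Dict[Hashable, FakeObserver]:
    per_group: Dict[int, FakeObserver] = {}
    obs_map: Dict[Hashable, FakeObserver] = {}
    for key, gid in group_ids.items():
        if gid not in per_group:
            per_group[gid] = FakeObserver()  # one instance per sharing group
        obs_map[key] = per_group[gid]        # group members alias the same object
    return obs_map

obs_map = build_obs_map({"cat1": 0, "(op1, cat1)": 0, "(op2, cat1)": 0, "op3": 1})
assert obs_map["cat1"] is obs_map["(op1, cat1)"]
assert obs_map["cat1"] is not obs_map["op3"]
```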
nodes_before_observation = list(model.graph.nodes) - obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize] = {} + + # At the high level we construct a map from EdgeOrNode to a observer_or_fake_quant instance + # all edge/nodes that belongs to the same group will use the same instance + # and when we insert observers we'll just query this map to get the correct observer_or_fake_quant + # instance + edge_or_node_to_qspec = _get_edge_or_node_to_qspec(model) + edge_or_node_to_group_id = _get_edge_or_node_to_group_id(edge_or_node_to_qspec) + obs_or_fq_map = _get_obs_or_fq_map(edge_or_node_to_group_id, edge_or_node_to_qspec, is_qat) for node in nodes_before_observation: + # TODO: simplify logic for inserting observers _maybe_insert_input_and_output_observers_for_node(node, model, obs_or_fq_map, is_qat) model = GraphModule(model, model.graph) diff --git a/torch/ao/quantization/quantize_pt2e.py b/torch/ao/quantization/quantize_pt2e.py index 765cb9446bfd1..85cd839f46a38 100644 --- a/torch/ao/quantization/quantize_pt2e.py +++ b/torch/ao/quantization/quantize_pt2e.py @@ -231,8 +231,8 @@ def convert_pt2e( model = _convert_to_reference_decomposed_fx(model) model = _fold_conv_bn_qat(model) pm = PassManager([DuplicateDQPass()]) - model = pm(model).graph_module + model = pm(model).graph_module pm = PassManager([PortNodeMetaForQDQ()]) model = pm(model).graph_module diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index b6983a1c25e78..80e069bcfb43f 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -1517,12 +1517,12 @@ Call this whenever a new thread is created in order to propagate values from [](const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias_opt, - at::IntArrayRef stride_, + at::SymIntArrayRef stride_, at::SymIntArrayRef padding_, - at::IntArrayRef dilation_, + at::SymIntArrayRef dilation_, bool transposed_, at::SymIntArrayRef output_padding_, - int64_t groups_) { + c10::SymInt groups_) { return at::native::select_conv_backend( input, weight, @@ -1532,7 +1532,7 @@ Call this whenever a new thread is created in order to propagate values from dilation_, transposed_, output_padding_, - groups_, + std::move(groups_), c10::nullopt); }, py::arg("input"), @@ -1551,12 +1551,12 @@ Call this whenever a new thread is created in order to propagate values from [](const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias, - at::IntArrayRef stride_, + at::SymIntArrayRef stride_, at::SymIntArrayRef padding_, - at::IntArrayRef dilation_, + at::SymIntArrayRef dilation_, bool transposed_, at::SymIntArrayRef output_padding_, - int64_t groups_, + c10::SymInt groups_, c10::optional> bias_sizes_opt) { c10::OptionalArrayRef ref = c10::nullopt; if (bias_sizes_opt) { @@ -1571,7 +1571,7 @@ Call this whenever a new thread is created in order to propagate values from dilation_, transposed_, output_padding_, - groups_, + std::move(groups_), ref); }, py::arg("input"), diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 7b0f14eeeb830..5d212c6ce9a0f 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -1274,12 +1274,12 @@ Tensor convolution_jvp( const Tensor& weight_t, const Tensor& bias_p, const Tensor& bias_t, - IntArrayRef stride, + at::SymIntArrayRef stride, at::SymIntArrayRef padding, - IntArrayRef dilation, + at::SymIntArrayRef dilation, bool transposed, at::SymIntArrayRef output_padding, - int64_t groups) { + const c10::SymInt& groups) { auto bias_t_opt = 
bias_t.defined() ? c10::optional(bias_t) : c10::nullopt; return ( @@ -1312,12 +1312,12 @@ Tensor _convolution_jvp( const Tensor& weight_t, const Tensor& bias_p, const Tensor& bias_t, - IntArrayRef stride, + at::SymIntArrayRef stride, at::SymIntArrayRef padding, - IntArrayRef dilation, + at::SymIntArrayRef dilation, bool transposed, at::SymIntArrayRef output_padding, - int64_t groups, + const c10::SymInt& groups, bool benchmark, bool deterministic, bool cudnn_enabled, @@ -6670,30 +6670,31 @@ std::tuple _cudnn_convolution_backward( const at::Tensor& self, const at::Tensor& grad_output, const at::Tensor& weight, - at::IntArrayRef padding, - at::IntArrayRef output_padding, - at::IntArrayRef stride, - at::IntArrayRef dilation, + at::SymIntArrayRef padding, + at::SymIntArrayRef output_padding, + at::SymIntArrayRef stride, + at::SymIntArrayRef dilation, bool transposed, - int64_t groups, + c10::SymInt groups, ::std::array output_mask) { if (!grad_output.defined()) { return std::tuple(); } // Just call the general backward and ignore the bias gradient part. - std::tuple grad_inputs = at::convolution_backward( - grad_output, - self, - weight, - c10::nullopt, - stride, - padding, - dilation, - transposed, - output_padding, - groups, - {output_mask[0], output_mask[1], false}); + std::tuple grad_inputs = + at::convolution_backward_symint( + grad_output, + self, + weight, + c10::nullopt, + stride, + padding, + dilation, + transposed, + output_padding, + std::move(groups), + {output_mask[0], output_mask[1], false}); std::tuple result = std::make_tuple(std::get<0>(grad_inputs), std::get<1>(grad_inputs)); return result; diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index 571879f7c4d93..3d232a41f468a 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -971,12 +971,12 @@ Tensor convolution_jvp( const Tensor& weight_t, const Tensor& bias_p, const Tensor& bias_t, - IntArrayRef stride, + at::SymIntArrayRef stride, at::SymIntArrayRef padding, - IntArrayRef dilation, + at::SymIntArrayRef dilation, bool transposed, at::SymIntArrayRef output_padding, - int64_t groups); + const c10::SymInt& groups); Tensor _convolution_jvp( const Tensor& input_p, @@ -985,12 +985,12 @@ Tensor _convolution_jvp( const Tensor& weight_t, const Tensor& bias_p, const Tensor& bias_t, - IntArrayRef stride, + at::SymIntArrayRef stride, at::SymIntArrayRef padding, - IntArrayRef dilation, + at::SymIntArrayRef dilation, bool transposed, at::SymIntArrayRef output_padding, - int64_t groups, + const c10::SymInt& groups, bool benchmark, bool deterministic, bool cudnn_enabled, @@ -1023,12 +1023,12 @@ std::tuple _cudnn_convolution_backward( const at::Tensor& self, const at::Tensor& grad_output, const at::Tensor& weight, - at::IntArrayRef padding, - at::IntArrayRef output_padding, - at::IntArrayRef stride, - at::IntArrayRef dilation, + at::SymIntArrayRef padding, + at::SymIntArrayRef output_padding, + at::SymIntArrayRef stride, + at::SymIntArrayRef dilation, bool transposed, - int64_t groups, + c10::SymInt groups, ::std::array output_mask); Tensor scatter_reduce_jvp( diff --git a/torch/csrc/autograd/functions/accumulate_grad.cpp b/torch/csrc/autograd/functions/accumulate_grad.cpp index c7f0923752c93..265d9fa5594f0 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.cpp +++ b/torch/csrc/autograd/functions/accumulate_grad.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -83,17 +84,19 @@ variable_list AccumulateGrad::apply_with_saved( 
at::Tensor grad_copy = variable.grad(); saved.before(variable_copy); saved.before(grad_copy); - accumulateGrad( - variable_copy, - grad_copy, - grads[0], - 0 /* num_expected_refs, 0 disables aliased reuse */, - [&saved, this](const at::Tensor& grad_update) { - saved.assign_mutable_grad(variable, grad_update); - }); + variable_copy.mutable_grad() = grad_copy; + // op is intentionally static + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("inductor::accumulate_grad_", "") + .typed(); + op.call(variable_copy, grads[0]); saved.after(variable_copy); saved.after(grad_copy); + TORCH_CHECK( + tensor_post_acc_grad_hooks() == nullptr, + "compiled_autograd does not support tensor_post_acc_grad_hooks"); + return variable_list(); } diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index b14517523f521..a016644f4febc 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -7,22 +7,20 @@ #include -#include - namespace c10d { ncclComm_t NCCLComm::getNcclComm() { std::unique_lock lock(mutex_); if (aborted_) { auto commFailureMsg = commFailureReason_ != c10::nullopt - ? fmt::format( - " Original reason for failure was: {}", *commFailureReason_) + ? c10::str(" Original reason for failure was: ", *commFailureReason_) : ""; TORCH_CHECK( false, - fmt::format( - "NCCL communicator was aborted on rank {}. {}", + c10::str( + "NCCL communicator was aborted on rank ", rank_, + ". ", commFailureMsg)); } return ncclComm_; diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 45dc9110053fe..231882a9beeee 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -14,6 +13,7 @@ #include #include +#include #include #include #include @@ -288,20 +288,24 @@ const int64_t ProcessGroupNCCL::kWatchdogThreadSleepMillis = 1000; constexpr int64_t kSynchronizeBusyWaitMillis = 10; thread_local uint64_t ProcessGroupNCCL::ncclActiveGroupCounter_ = 0; -std::string getWorkInfo(const ProcessGroupNCCL::WorkNCCL& workNCCL) { - return fmt::format( - "WorkNCCL(SeqNum={}, OpType={}, NumelIn={}, NumelOut={}, Timeout(ms)={})", +std::ostream& operator<<( + std::ostream& output, + const ProcessGroupNCCL::WorkNCCL& workNCCL) { + std::string workInfo; + workInfo = c10::str( + "WorkNCCL(", + "SeqNum=", workNCCL.seq_, + ", OpType=", opTypeToString(workNCCL.opType_), + ", NumelIn=", workNCCL.numelIn_, + ", NumelOut=", workNCCL.numelOut_, - workNCCL.opTimeout_.count()); -} - -std::ostream& operator<<( - std::ostream& output, - const ProcessGroupNCCL::WorkNCCL& workNCCL) { - return output << getWorkInfo(workNCCL); + ", Timeout(ms)=", + workNCCL.opTimeout_.count(), + ")"); + return output << workInfo; } ProcessGroupNCCL::WorkNCCL::WorkNCCL( @@ -447,12 +451,15 @@ bool ProcessGroupNCCL::WorkNCCL::checkTimeout( if (exception()) return true; - std::string exceptionMsg = fmt::format( - "[Rank {}] Watchdog caught collective operation timeout: {} ran for {}" - " milliseconds before timing out.", + std::string exceptionMsg = c10::str( + "[Rank ", rank_, - getWorkInfo(*this), - timeElapsed.count()); + "] ", + "Watchdog caught collective operation timeout: ", + *this, + " ran for ", + timeElapsed.count(), + " milliseconds before timing out."); LOG(ERROR) << exceptionMsg; std::exception_ptr exception_ptr = @@ -514,10 +521,12 @@ void 
ProcessGroupNCCL::WorkNCCL::synchronizeInternal( // here, it was observed that CUDA GPU will have 100% utilization and // can not run new events successfully. if (timedOut) { - std::string exceptionMsg = fmt::format( - "[Rank {}] Work {} timed out in blocking wait (NCCL_BLOCKING_WAIT=1).", + std::string exceptionMsg = c10::str( + "[Rank ", rank_, - getWorkInfo(*this)); + "] Work ", + (*this), + " timed out in blocking wait (NCCL_BLOCKING_WAIT=1)."); LOG(ERROR) << exceptionMsg; break; } @@ -881,8 +890,7 @@ ProcessGroupNCCL::~ProcessGroupNCCL() { onCompletionHookThread_.join(); // Abort all NCCL Communicators on Process Group Destruction - std::string abortReason = - fmt::format("Process Group destroyed on rank {}", rank_); + std::string abortReason = c10::str("Process Group destroyed on rank ", rank_); abort(abortReason); } @@ -895,17 +903,17 @@ void ProcessGroupNCCL::ncclCommWatchdog() { } catch (std::exception& e) { if (std::string(e.what()).find("driver shutting down") != std::string::npos) { - LOG(INFO) << fmt::format( - "[Rank {}] " - "main process destroyed cuda before watchdog loop exited, terminating watchdog. " - "(Watchdog caught exception: {})", - rank_, - e.what()); + LOG(INFO) + << "[Rank " << rank_ + << "] main process destroyed cuda before watchdog loop exited, terminating watchdog." + << " (Watchdog caught exception: " << e.what(); + } else { // Append error message reported from workCleanupLoop - const auto exitMsg = fmt::format( - "[Rank {}] NCCL watchdog thread terminated with exception: {}", + const auto exitMsg = c10::str( + "[Rank ", rank_, + "] NCCL watchdog thread terminated with exception: ", e.what()); LOG(ERROR) << exitMsg; // TODO(whc) clean up the rethrow - why is it stored in a class var and @@ -914,9 +922,10 @@ void ProcessGroupNCCL::ncclCommWatchdog() { std::rethrow_exception(watchDogException_); } } catch (...) { - const auto exitMsg = fmt::format( - "[Rank {}] NCCL watchdog thread terminated with exception: unknown", - rank_); + const auto exitMsg = c10::str( + "[Rank ", + rank_, + "] NCCL watchdog thread terminated with exception: unknown"); LOG(ERROR) << exitMsg; watchDogException_ = std::make_exception_ptr(std::runtime_error(exitMsg)); std::rethrow_exception(watchDogException_); @@ -1018,6 +1027,7 @@ void ProcessGroupNCCL::workCleanupLoop() { } else { it = workMetaList_.erase(it); } + at::cuda::CUDAGraph::dec_pending_event_queries(); } else { // Increment the iterator if the current WorkNCCL object is not // completed. @@ -1113,8 +1123,8 @@ std::exception_ptr ProcessGroupNCCL::checkForNCCLErrorsInternal( // commFailureReason is set. 
auto commFailureReason = ncclComm->getNcclCommFailureReason(); if (commFailureReason != c10::nullopt) { - return std::make_exception_ptr(std::runtime_error(fmt::format( - "NCCL communicator encountered error set by ProcessGroupNCCL: {}", + return std::make_exception_ptr(std::runtime_error(c10::str( + "NCCL communicator encountered error set by ProcessGroupNCCL: ", *commFailureReason))); } ncclResult_t ncclAsyncErr = ncclComm->checkForNcclError(); @@ -1164,12 +1174,15 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID( TORCH_CHECK(vec.size() == NCCL_UNIQUE_ID_BYTES); std::memcpy(ncclID, vec.data(), vec.size()); } catch (const std::exception& e) { - std::string exceptionMsg = fmt::format( - "[{}] is setting up NCCL communicator and retrieving ncclUniqueId " - "from [0] via c10d key-value store by key '{}', but store->get('{}') got error: ", + std::string exceptionMsg = c10::str( + "[", rank_, + "] is setting up NCCL communicator and " + "retrieving ncclUniqueId from [0] via c10d key-value store by key '", + storeKey, + "', but store->get('", storeKey, - storeKey); + "') got error: "); TORCH_CHECK( false, exceptionMsg + e.what() + @@ -1177,12 +1190,14 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID( } catch (...) { TORCH_CHECK( false, - fmt::format( - "Unknown exception while [{}] is setting up NCCL communicator and " - "retrieving ncclUniqueId from [0] via c10d key-value store by key '{}" - "'. This may indicate a possible application crash on rank 0 or a network set up issue.", + c10::str( + "Unknown exception while [", rank_, - storeKey)); + "] is setting up NCCL communicator and " + "retrieving ncclUniqueId from [0] via c10d key-value store by key '", + storeKey, + "'", + ". This may indicate a possible application crash on rank 0 or a network set up issue.")); } } } @@ -1834,8 +1849,13 @@ c10::intrusive_ptr ProcessGroupNCCL::collective( work->numelIn_ = inputs[0].numel(); work->numelOut_ = outputs[0].numel(); + // Notify graphs before we check the capture status preemptively + at::cuda::CUDAGraph::inc_pending_event_queries(); + if (!coalescing_state_ && capture_status == c10::cuda::CaptureStatus::None) { workEnqueue(work); + } else { + at::cuda::CUDAGraph::dec_pending_event_queries(); } return work; @@ -2013,8 +2033,14 @@ c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( // Enqueue P2P op so that it can be cancelled by NCCL watchdog c10::cuda::CaptureStatus capture_status = c10::cuda::currentStreamCaptureStatusMayInitCtx(); + + // Notify graphs before we check the capture status preemptively + at::cuda::CUDAGraph::inc_pending_event_queries(); + if (!coalescing_state_ && capture_status == c10::cuda::CaptureStatus::None) { workEnqueue(work); + } else { + at::cuda::CUDAGraph::dec_pending_event_queries(); } return work; @@ -2813,13 +2839,14 @@ c10::intrusive_ptr ProcessGroupNCCL::barrier(const BarrierOptions& opts) { // ensure that each process is on a different GPU auto numGPUs = at::cuda::getNumGPUs(); int16_t deviceIdx = static_cast(rank_ % numGPUs); - LOG(INFO) << fmt::format( - "Rank {} using GPU {}" - " to perform barrier as devices used by this process are currently unknown. " - "This can potentially cause a hang if this rank to GPU mapping is incorrect." - "Specify device_ids in barrier() to force use of a particular device.", + LOG(INFO) << c10::str( + "Rank ", this->getRank(), - deviceIdx); + " using GPU ", + deviceIdx, + " to perform barrier as devices used by this process are currently unknown. 
", + "This can potentially cause a hang if this rank to GPU mapping is incorrect.", + "Specify device_ids in barrier() to force use of a particular device."); devices.emplace_back(getDeviceForRank(rank_)); } else { for (auto usedDeviceIdx : usedDeviceIdxs_) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index fd0fe5c8e85ce..782b55fec7b8e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -233,7 +233,6 @@ class TORCH_API ProcessGroupNCCL : public Backend { friend std::ostream& operator<<( std::ostream& output, const WorkNCCL& workNCCL); - friend std::string getWorkInfo(const WorkNCCL& workNCCL); private: // Helper function for synchronize diff --git a/torch/csrc/dynamo/compiled_autograd.h b/torch/csrc/dynamo/compiled_autograd.h index 18c7723ac11c9..f44c2185a9959 100644 --- a/torch/csrc/dynamo/compiled_autograd.h +++ b/torch/csrc/dynamo/compiled_autograd.h @@ -469,7 +469,6 @@ struct TraceState { size_t sym_sizes_index; std::vector> sym_sizes; variable_list outputs; - std::vector output_grad_targets; }; class SwapSavedVariables { @@ -620,17 +619,6 @@ class SwapSavedVariables { NO_OP_VISIT(double); #undef NO_OP_VISIT - // record the need to run `dst.mutable_grad() = src` after the graph - // dst is a real tensor, src is a fake tensor - void assign_mutable_grad(const at::Tensor& dst, const at::Tensor& src) { - const TensorArg& arg = compiler.tensor_args.lookup(dst); - TORCH_INTERNAL_ASSERT(arg.defined()); - TORCH_INTERNAL_ASSERT( - state.outputs.size() == state.output_grad_targets.size()); - state.outputs.emplace_back(src); - state.output_grad_targets.emplace_back(arg.index()); - } - SwapSavedVariables(AutogradCompilerCall& c, TraceState& s) : compiler(c), state(s) {} diff --git a/torch/csrc/dynamo/python_compiled_autograd.cpp b/torch/csrc/dynamo/python_compiled_autograd.cpp index e5c6b51238f10..515115e0a49f6 100644 --- a/torch/csrc/dynamo/python_compiled_autograd.cpp +++ b/torch/csrc/dynamo/python_compiled_autograd.cpp @@ -104,7 +104,6 @@ struct CacheNode { next.clear(); key_storage.clear(); expected_sizes.clear(); - output_grad_targets.clear(); compiled_fn = nullptr; } @@ -208,9 +207,6 @@ struct CacheNode { std::vector expected_sizes; THPObjectPtr compiled_fn; - // Maps each return value of compiled_fn to an input index. After the graph - // runs we do: `inputs[output_grad_targets[i]].mutable_grad() = outputs[i]` - std::vector output_grad_targets; }; struct InputBuffers : public std::unordered_map { @@ -463,7 +459,6 @@ variable_list compiled_autograd( } cache->compiled_fn = check(call_end_capture(py_compiler, state.outputs)); - cache->output_grad_targets = std::move(state.output_grad_targets); state.debug_asserts(); } // End cache miss region @@ -482,21 +477,8 @@ variable_list compiled_autograd( THPObjectPtr pyresult(check(PyObject_CallFunctionObjArgs( cache->compiled_fn.get(), inputs.get(), sizes.get(), hooks.get(), NULL))); variable_list outputs = THPVariable_UnpackList(pyresult); - if (accumulate_grad) { - TORCH_INTERNAL_ASSERT(outputs.size() == cache->output_grad_targets.size()); - for (const auto i : c10::irange(outputs.size())) { - // Here we set the `var.grad = ...` for each call to - // `saved.assign_mutable_grad(var, ...)`. For the case on inplace grad - // accumuation there will be an `add_` op in the graph and no return - // value. 
- compiler_call.tensor_args.inputs[cache->output_grad_targets[i]] - .mutable_grad() = outputs[i]; - } - return variable_list(); - } else { - TORCH_INTERNAL_ASSERT(outputs.size() == output_edges.size()); - return outputs; - } + TORCH_INTERNAL_ASSERT(outputs.size() == output_edges.size()); + return outputs; } static PyObject* set_autograd_compiler(PyObject* dummy, PyObject* args) { diff --git a/torch/csrc/inductor/inductor_ops.cpp b/torch/csrc/inductor/inductor_ops.cpp index 7f72773956cfd..7dc48f0702098 100644 --- a/torch/csrc/inductor/inductor_ops.cpp +++ b/torch/csrc/inductor/inductor_ops.cpp @@ -4,6 +4,7 @@ #include #endif +#include #include #include @@ -38,13 +39,34 @@ Tensor _reinterpret_tensor( return self_; } +static void accumulate_grad_(const Tensor& variable, const Tensor& new_grad) { + at::Tensor& grad = variable.mutable_grad(); + if (new_grad.device() != kMeta) { + torch::autograd::AccumulateGrad::accumulateGrad( + variable, + grad, + new_grad, + 1 /* num_expected_refs */, + [&grad](at::Tensor&& grad_update) { grad = std::move(grad_update); }); + } else { + // no shape checking for `device="meta"` to workaround FSDP inplace mutation + if (!grad.defined()) { + grad = new_grad; + } + } +} + TORCH_LIBRARY_FRAGMENT(inductor, m) { m.def( "_mm_plus_mm(Tensor a, Tensor b, Tensor c, Tensor d, Tensor(t!) out) -> Tensor(t!)", - _mm_plus_mm); + dispatch(c10::DispatchKey::CompositeExplicitAutograd, _mm_plus_mm)); m.def( "_reinterpret_tensor(Tensor self, int[] size, int[] stride, int offset_increment=0) -> Tensor", - _reinterpret_tensor); + dispatch( + c10::DispatchKey::CompositeExplicitAutograd, _reinterpret_tensor)); + m.def( + "accumulate_grad_(Tensor variable, Tensor new_grad) -> ()", + dispatch(c10::DispatchKey::CompositeExplicitAutograd, accumulate_grad_)); } } // namespace inductor diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 4dc1710923d6b..762d7b9f3305f 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -1455,6 +1455,7 @@ void initJITBindings(PyObject* module) { PyObject_CallMethod(buffer_.ptr(), "readinto", "O", memview.get()); if (res) { int64_t i = static_cast(PyLong_AsLongLong(res)); + Py_DECREF(res); if (i > 0) { return i; } diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index c5526ab58c5dc..abf1371e290a1 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -629,6 +629,11 @@ inline std::vector PythonArgs::intlistWithDefault( if (size1 > 0 && THPUtils_checkLong(arg)) { return std::vector(size1, THPUtils_unpackLong(arg)); } + if (size1 > 0 && torch::is_symint(py::handle(arg))) { + return std::vector( + size1, + py::handle(arg).cast().guard_int(__FILE__, __LINE__)); + } auto tuple = PyTuple_Check(arg); // NOLINTNEXTLINE(bugprone-branch-clone) const auto size2 = tuple ? PyTuple_GET_SIZE(arg) : PyList_GET_SIZE(arg); diff --git a/torch/distributed/_shard/sharded_tensor/utils.py b/torch/distributed/_shard/sharded_tensor/utils.py index 950b3d2b87d5e..9ca0ee0eba743 100644 --- a/torch/distributed/_shard/sharded_tensor/utils.py +++ b/torch/distributed/_shard/sharded_tensor/utils.py @@ -109,7 +109,7 @@ def build_metadata_from_local_shards( first_shard_is_pinned = local_shards[0].tensor.is_pinned() # 1). 
Validate local tensors and associated metadatas - for i, local_shard in enumerate(local_shards): + for local_shard in local_shards: local_shard_tensor = local_shard.tensor local_shard_meta = local_shard.metadata local_shard_metadatas.append(local_shard_meta) diff --git a/torch/distributed/_tensor/_utils.py b/torch/distributed/_tensor/_utils.py index 592cb6bd6e25e..9aac9e01d4732 100644 --- a/torch/distributed/_tensor/_utils.py +++ b/torch/distributed/_tensor/_utils.py @@ -143,7 +143,12 @@ def compute_global_tensor_info( for idx, placement in enumerate(placements): mesh_dim_size = mesh.size(idx) if placement.is_shard(): - shard_dim = cast(Shard, placement).dim + shard_placement = cast(Shard, placement) + if shard_placement.dim < 0: + # normalize shard dim to be positive + shard_placement.dim += len(tensor_shape) + shard_dim = shard_placement.dim + local_dim_size = tensor_shape[shard_dim] tensor_shape[shard_dim] = local_dim_size * mesh_dim_size diff --git a/torch/distributed/_tensor/_xla.py b/torch/distributed/_tensor/_xla.py new file mode 100644 index 0000000000000..80d93c7652fae --- /dev/null +++ b/torch/distributed/_tensor/_xla.py @@ -0,0 +1,200 @@ +import logging +import os +from functools import wraps +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union + +import torch + +import torch.nn as nn +from torch.distributed._tensor.device_mesh import DeviceMesh +from torch.distributed._tensor.placement_types import Placement, Replicate + +log = logging.getLogger(__name__) + +TORCH_XLA_INITIALIZED = False +try: + import torch_xla.core.xla_model as xm # type:ignore[import] # noqa: F401 + import torch_xla.runtime as xr # type:ignore[import] + from torch_xla.experimental.xla_sharded_tensor import ( # type:ignore[import] + XLAShardedTensor, + ) + from torch_xla.experimental.xla_sharding import ( # type:ignore[import] + mark_sharding, + Mesh, + ShardingType, + ) + + TORCH_XLA_INITIALIZED = True +except ImportError as e: + log.warning(e.msg) + + +# wrapper to check xla test requirements +def with_xla(func: Callable) -> Callable: + assert func is not None + + @wraps(func) # pyre-ignore[6] + def wrapper( + self, *args: Tuple[object], **kwargs: Dict[str, Any] # type: ignore[misc] + ) -> None: + if TORCH_XLA_INITIALIZED: + # TODO(yeounoh) replace this with xr.use_spmd() when we deprecate the flag. + os.environ["XLA_USE_SPMD"] = "1" + return func(self, *args, **kwargs) # type: ignore[misc] + else: + raise ImportError( + "torch.distributed._tensor._xla API requires torch_xla package installation." + ) + + return wrapper + + +@with_xla +def convert_to_xla_mesh(dt_mesh: DeviceMesh) -> "Mesh": + """ + Convert DTensor `dt_mesh` to XLAShardedTensor `partition_spec`. + + Example (1x4 logical device mesh topology): + ``` + dt_mesh = DeviceMesh("xla", [[1, 2, 3, 4]]) + dt_mesh.mesh.shape + >> torch.Size([1, 4]) + + mesh = convert_to_xla_mesh(dt_mesh) + mesh_shape + >> [1, 4] + ``` + """ + assert dt_mesh.size() == xr.global_runtime_device_count() + return Mesh( + dt_mesh.mesh.flatten(), tuple(dt_mesh.mesh.size()), dt_mesh.mesh_dim_names + ) + + +@with_xla +def convert_to_xla_partition_spec( + tensor: torch.Tensor, placements: Sequence[Placement] +) -> Tuple[Union[Tuple, int, None]]: + """ + Convert DTensor `placements` to XLAShardedTensor `partitoin_spec`. + This supports Shard and Replicate Placement types. + + Example: + ``` + # Mesh partitioning, 1/4-th of the input with replicated overlaps. 
+ # The first input tensor dimension is sharded across the second mesh + # dimension, and the rest is replicated over the first mesh dimension. + t = torch.randn(4, 8, 8) + dt_mesh = DeviceMesh("xla", torch.arange(8).reshape(2,4)) + placements = [Replicate(), Shard(0)] + my_dtensor = distribute_tensor(t, dt_mesh, placements) + + # `placements = [Replicate(), Shard(0)]` describes sharding per mesh dim, + # and this is equivalent to `partition_spec = (1, None, None)` which is + # sharding per input tensor dimension. + partition_spec = convert_to_xla_partition_spec(t, placements) + >> (1, None, None) + ``` + """ + # per tensor dimension sharding + sharding_spec = [None] * len(tensor.shape) + for mesh_idx, spec in enumerate(placements): + if spec.is_shard(): # type:ignore[truthy-function] + # mesh_idx to tensor_idx (spec.dim) + tensor_idx = spec.dim # type:ignore[attr-defined] + sharding_spec[tensor_idx] = mesh_idx # type:ignore[call-overload] + elif spec.is_replicate(): + # spec.dim is already set to None by default + continue + else: + raise ValueError(f"Unsupported placement type: {type(spec).__name__}") + return tuple(sharding_spec) # type:ignore[return-value] + + +@with_xla +def xla_distribute_tensor( + tensor: torch.Tensor, + device_mesh: DeviceMesh, + placements: Optional[Sequence[Placement]] = None, +) -> "XLAShardedTensor": + """ + Distribute a torch.Tensor to the `device_mesh` according to the `placements` + specified. The rank of `device_mesh` and `placements` must be the same. + + Args: + tensor (torch.Tensor): torch.Tensor to be distributed. Note that if you + want to shard a tensor on a dimension that is not evenly divisible by + the number of devices in that mesh dimension, we use `torch.chunk` + semantic to shard the tensor and scatter the shards. + device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to distribute the + tensor, if not specified, must be called under a DeviceMesh context + manager, default: None + placements (List[:class:`Placement`], optional): the placements that + describes how to place the tensor on DeviceMesh, must have the same + number of elements as `device_mesh.ndim`. If not specified, we will + by default replicate the tensor across the `device_mesh` from the + first rank of each dimension of the `device_mesh`. + + Returns: + A :class:`XLAShardedTensor` object + + .. note:: We return a XLAShardedTensor with a global view and access to local shards. + The successive ops would be programmed as if on a single-device and without calling + any explicit collective ops. The actual sharded computation on the sharding annotated tensor + happens lazily, is transparent to the user. In the future, we will introduce + a new DTensor type for this kind of programming-mode (single-controller) and return. + """ + # device_mesh is not optional in xla_distribute_tensor + dt_mesh = device_mesh + assert dt_mesh.device_type == "xla" + + # convert to XLA device mesh + xla_mesh = convert_to_xla_mesh(dt_mesh) + assert xla_mesh.mesh_shape == tuple(dt_mesh.mesh.size()) + + # convert tensor to the corresponding device type if it's not in that device type + if not tensor.is_meta: + tensor = tensor.to(dt_mesh.device_type) + # set default placements to replicated if not specified + if placements is None: + placements = [Replicate() for _ in range(dt_mesh.ndim)] + assert ( + len(placements) == dt_mesh.ndim + ), "`placements` must have the same length as `device_mesh.ndim`! " + f"Found placements length: {len(placements)}, and device_mesh.ndim: {dt_mesh.ndim}." 
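The placement-to-partition-spec conversion above is the only dimension bookkeeping involved: `placements` is indexed by mesh dim, while `partition_spec` is indexed by tensor dim. A dependency-free restatement of that mapping (no torch_xla required; `to_partition_spec` is a hypothetical helper, not the function above):

```python
from typing import Optional, Sequence, Tuple

from torch.distributed._tensor.placement_types import Placement, Replicate, Shard

def to_partition_spec(
    tensor_ndim: int, placements: Sequence[Placement]
) -> Tuple[Optional[int], ...]:
    # One entry per *tensor* dim, naming the *mesh* dim that shards it
    # (None = replicated) -- the inverse orientation of `placements`.
    spec: list = [None] * tensor_ndim
    for mesh_dim, placement in enumerate(placements):
        if placement.is_shard():
            spec[placement.dim] = mesh_dim
    return tuple(spec)

# Matches the docstring example: Shard(0) on mesh dim 1 -> (1, None, None)
assert to_partition_spec(3, [Replicate(), Shard(0)]) == (1, None, None)
```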
+ # convert placements to xla partition spec + partition_spec = convert_to_xla_partition_spec(tensor, placements) + assert len(tensor.shape) == len( + partition_spec + ), "`partition_spec` from `placements` must have the same length as `tensor.length`! " + f"Found tensor shape length: {len(tensor.shape)}, and partition_spec length: {len(partition_spec)}." + + global_tensor = tensor + if type(tensor).__name__ == "DTensor": + raise ValueError( + "Cannot distribute a DTensor with local tensor on xla devices." + "The input tensor must be global." + ) + if type(tensor).__name__ == "XLAShardedTensor": + sharding_type = tensor.sharding_type # type:ignore[attr-defined] + assert ( + sharding_type is None or sharding_type == ShardingType.REPLICATED + ), "XLAShardedTensor `tensor` is already annotated with non-replication sharding. " + "Clear the existing sharding annotation first, by callling torch_xla.experimental.xla_sharding.clear_sharding API." + global_tensor = tensor.global_tensor # type:ignore[attr-defined] + assert global_tensor is not None, "distributing a tensor should not be None" + + # Annotates sharding and returns an XLAShardedTensor + xla_tensor = mark_sharding(global_tensor, xla_mesh, partition_spec) + return xla_tensor + + +@with_xla +def xla_distribute_module( + module: nn.Module, + device_mesh: Optional[DeviceMesh] = None, + partition_fn: Optional[Callable[[str, nn.Module, DeviceMesh], None]] = None, + input_fn: Optional[Callable[..., None]] = None, + output_fn: Optional[Callable[..., None]] = None, +) -> nn.Module: + raise NotImplementedError diff --git a/torch/distributed/_tensor/api.py b/torch/distributed/_tensor/api.py index 26d1d52c2c61a..e2bd0d60ad7e1 100644 --- a/torch/distributed/_tensor/api.py +++ b/torch/distributed/_tensor/api.py @@ -237,7 +237,6 @@ def __tensor_unflatten__(inner_tensors, flatten_spec): assert ( flatten_spec is not None ), "Expecting spec to be not None from `__tensor_flatten__` return value!" - assert isinstance(inner_tensors, dict) and len(inner_tensors) == 1 local_tensor = inner_tensors["_local_tensor"] spec, requires_grad = flatten_spec return DTensor( @@ -394,15 +393,18 @@ def redistribute( if placements is None: raise RuntimeError("placements is needed for redistribute!") - # Early return the original DTensor if the placements are the same. - if self._spec.placements == placements: - return self - for placement in placements: if placement.is_partial(): raise RuntimeError( "Can not redistribute to _Partial, _Partial is for internal use only!" ) + elif isinstance(placement, Shard) and placement.dim < 0: + # normalize shard dim to be positive + placement.dim += self.ndim + + # Early return the original DTensor if the placements are the same. + if self._spec.placements == placements: + return self # pyre-fixme[16]: `Redistribute` has no attribute `apply`. return Redistribute.apply(self, device_mesh, placements) @@ -451,7 +453,12 @@ def distribute_tensor( first rank of each dimension of the `device_mesh`. Returns: - A :class:`DTensor` object + A :class:`DTensor` or `XLAShardedTensor` object. + + Note: + When initialize the DeviceMesh with the `xla` device_type, `distribute_tensor` + return `XLAShardedTensor` instead. see [link](https://github.com/pytorch/pytorch/issues/92909) + for more details. The XLA integration is experimental and subject to change. 
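Several DTensor call sites in this change (`compute_global_tensor_info`, `redistribute`, `distribute_tensor`) now normalize negative `Shard` dims in place via `placement.dim += ndim`. A small sketch of that convention, assuming ordinary Python negative-index semantics (`normalize_shard_dim` is illustrative, not an API):

```python
import torch
from torch.distributed._tensor.placement_types import Shard

def normalize_shard_dim(placement: Shard, ndim: int) -> Shard:
    # Shard(-1) on a 3-d tensor means "shard the last dim", i.e. Shard(2),
    # mirroring the `placement.dim += tensor.ndim` normalization above.
    dim = placement.dim + ndim if placement.dim < 0 else placement.dim
    return Shard(dim)

t = torch.randn(2, 3, 4)
assert normalize_shard_dim(Shard(-1), t.ndim).dim == 2
assert normalize_shard_dim(Shard(1), t.ndim).dim == 1
```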
""" torch._C._log_api_usage_once("torch.dtensor.distribute_tensor") @@ -459,6 +466,14 @@ def distribute_tensor( # get default device mesh if there's nothing specified device_mesh = device_mesh or mesh_resources.get_current_mesh() device_type = device_mesh.device_type + if device_type == "xla": + # call PyTorch/XLA SPMD for `xla` backend type device mesh. + # This returns XLAShardedTensor + from torch.distributed._tensor._xla import xla_distribute_tensor + + return xla_distribute_tensor( + tensor, device_mesh, placements + ) # type:ignore[return-value] # instantiate a RNG tracker if haven't. By default DTensor uses an # OffsetBasedRNGTracker to perform random operators. @@ -485,7 +500,6 @@ def distribute_tensor( f"`placements` must have the same length as `device_mesh.ndim`! " f"Found placements length: {len(placements)}, and device_mesh.ndim: {device_mesh.ndim}." ) - if isinstance(tensor, DTensor): # if the tensor is already a DTensor, we just need to check if the # device mesh and placements are the same @@ -508,6 +522,9 @@ def distribute_tensor( for idx, placement in enumerate(placements): if placement.is_shard(): placement = cast(Shard, placement) + if placement.dim < 0: + # normalize shard placement dim + placement.dim += tensor.ndim local_tensor = placement._shard_tensor(local_tensor, device_mesh, idx) elif placement.is_replicate(): placement = cast(Replicate, placement) diff --git a/torch/distributed/_tensor/device_mesh.py b/torch/distributed/_tensor/device_mesh.py index 614d560119e7c..e023d4dfccb33 100644 --- a/torch/distributed/_tensor/device_mesh.py +++ b/torch/distributed/_tensor/device_mesh.py @@ -174,12 +174,16 @@ def __init__( # private field to pre-generate DeviceMesh's hash self._flatten_mesh_list = tuple(self.mesh.flatten().tolist()) self._hash = hash((self._flatten_mesh_list, self.mesh.shape)) - # always try to create default (world) pg, even if it is not initialized - # already. The world pg is used for device mesh identity (rank) on each - # process (we need to know if the current global rank is in the mesh or not) - self._get_or_create_default_group() - if _init_process_groups: - self._init_process_groups(_validate_mesh) + + # Skip process group initialization if xla device. + # TODO(yeounoh) implement DeviceMesh backend and register XLA backend. + if device_type != "xla": + # always try to create default (world) pg, even if it is not initialized + # already. The world pg is used for device mesh identity (rank) on each + # process (we need to know if the current global rank is in the mesh or not). 
diff --git a/torch/distributed/_tensor/sharding_prop.py b/torch/distributed/_tensor/sharding_prop.py
index a73c877525d29..5f472915f9fde 100644
--- a/torch/distributed/_tensor/sharding_prop.py
+++ b/torch/distributed/_tensor/sharding_prop.py
@@ -88,7 +88,7 @@ def _propagate_tensor_meta(self, op_schema: OpSchema) -> object:
 
         elif isinstance(fake_out, (tuple, list)):
             tensor_meta_list = []
-            for i, fake_out_item in enumerate(fake_out):
+            for fake_out_item in fake_out:
                 if isinstance(fake_out_item, torch.Tensor):
                     tensor_meta_list.append(
                         TensorMeta(
diff --git a/torch/fx/node.py b/torch/fx/node.py
index e400ceeb165aa..075040dc308bf 100644
--- a/torch/fx/node.py
+++ b/torch/fx/node.py
@@ -39,7 +39,9 @@
     _ops.aten.sym_constrain_range_for_size.default,
     _ops.profiler._record_function_enter,
     _ops.profiler._record_function_enter_new,
-    _ops.profiler._record_function_exit}
+    _ops.profiler._record_function_exit,
+    _ops.inductor.accumulate_grad_.default,
+}
 
 
 @compatibility(is_backward_compatible=False)
diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py
index 15de3bb55e52c..496cd16523b19 100644
--- a/torch/testing/_internal/common_quantization.py
+++ b/torch/testing/_internal/common_quantization.py
@@ -2571,6 +2571,20 @@ def forward(self, x, y):
             z = torch.cat([x, y], dim=1)
             return z
 
+    class Conv2dWithTwoCat(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 3, 3)
+            self.conv2 = torch.nn.Conv2d(3, 3, 3)
+
+        def forward(self, x1, x2, x3, x4):
+            x1 = self.conv1(x1)
+            x2 = self.conv2(x2)
+            y = torch.cat([x1, x2], dim=1)
+            z = x3 + x4
+            w = torch.cat([z, y])
+            return w
+
     class EmbeddingModule(torch.nn.Module):
         def __init__(self):
             super().__init__()
diff --git a/torch/testing/_internal/optests/aot_autograd.py b/torch/testing/_internal/optests/aot_autograd.py
index ed99640a6a8aa..4d2897c07385a 100644
--- a/torch/testing/_internal/optests/aot_autograd.py
+++ b/torch/testing/_internal/optests/aot_autograd.py
@@ -52,7 +52,7 @@ def aot_autograd_check(
     def func_no_tensors(args):
         reconstructed_flat_args = []
         args = iter(args)
-        for idx, v in enumerate(flat_args):
+        for v in flat_args:
            if isinstance(v, torch.Tensor):
                 reconstructed_flat_args.append(next(args))
             else:
diff --git a/torch/utils/benchmark/utils/timer.py b/torch/utils/benchmark/utils/timer.py
index ee99964184f2b..b101c38f4ccdb 100644
--- a/torch/utils/benchmark/utils/timer.py
+++ b/torch/utils/benchmark/utils/timer.py
@@ -328,36 +328,6 @@ def _estimate_block_size(self, min_run_time: float) -> int:
             number *= 10
         return number
 
-    def adaptive_autorange(
-            self,
-            threshold: float = 0.1,
-            *,
-            min_run_time: float = 0.01,
-            max_run_time: float = 10.0,
-            callback: Optional[Callable[[int, float], NoReturn]] = None,
-    ) -> common.Measurement:
-        number = self._estimate_block_size(min_run_time=0.05)
-
-        def time_hook() -> float:
-            return self._timeit(number)
-
-        def stop_hook(times: List[float]) -> bool:
-            if len(times) > 3:
-                return common.Measurement(
-                    number_per_run=number,
-                    raw_times=times,
-                    task_spec=self._task_spec
-                ).meets_confidence(threshold=threshold)
-            return False
-        times = self._threaded_measurement_loop(
-            number, time_hook, stop_hook, min_run_time, max_run_time, callback=callback)
-
-        return common.Measurement(
-            number_per_run=number,
-            raw_times=times,
-            task_spec=self._task_spec
-        )
-
     def blocked_autorange(
         self,
         callback: Optional[Callable[[int, float], NoReturn]] = None,
@@ -418,6 +388,69 @@ def stop_hook(times: List[float]) -> bool:
             task_spec=self._task_spec
         )
 
+    def adaptive_autorange(
+            self,
+            threshold: float = 0.1,
+            *,
+            min_run_time: float = 0.01,
+            max_run_time: float = 10.0,
+            callback: Optional[Callable[[int, float], NoReturn]] = None,
+    ) -> common.Measurement:
+        """Similar to `blocked_autorange` but also checks for variability in measurements
+        and repeats until iqr/median is smaller than `threshold` or `max_run_time` is reached.
+
+
+        At a high level, adaptive_autorange executes the following pseudo-code::
+
+            `setup`
+
+            times = []
+            while times.sum < max_run_time
+                start = timer()
+                for _ in range(block_size):
+                    `stmt`
+                times.append(timer() - start)
+
+                enough_data = len(times)>3 and times.sum > min_run_time
+                small_iqr=times.iqr/times.mean<threshold
+
+                if enough_data and small_iqr:
+                    break
+
+        Args:
+            threshold: value of max IQR/median threshold for stopping
+
+            min_run_time: total runtime needed before checking `threshold`
+
+            max_run_time: total runtime for all measurements regardless of `threshold`
+
+        Returns:
+            A `Measurement` object that contains measured runtimes and
+            repetition counts, and can be used to compute statistics.
+            (mean, median, etc.)
+        """
+        number = self._estimate_block_size(min_run_time=0.05)
+
+        def time_hook() -> float:
+            return self._timeit(number)
+
+        def stop_hook(times: List[float]) -> bool:
+            if len(times) > 3:
+                return common.Measurement(
+                    number_per_run=number,
+                    raw_times=times,
+                    task_spec=self._task_spec
+                ).meets_confidence(threshold=threshold)
+            return False
+        times = self._threaded_measurement_loop(
+            number, time_hook, stop_hook, min_run_time, max_run_time, callback=callback)
+
+        return common.Measurement(
+            number_per_run=number,
+            raw_times=times,
+            task_spec=self._task_spec
+        )
+
     @overload
     def collect_callgrind(
         self,
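A hedged usage sketch for the relocated `Timer.adaptive_autorange`: it keeps adding measurement blocks until IQR/median falls below `threshold` or `max_run_time` is exhausted, as the new docstring describes. The statement and matrix sizes are arbitrary choices for illustration.

```python
import torch
from torch.utils.benchmark import Timer

timer = Timer(
    stmt="torch.mm(a, b)",
    setup="a = torch.randn(256, 256); b = torch.randn(256, 256)",
)

# Stops early once iqr/median < 0.1, or after at most 5 seconds of measurement.
m = timer.adaptive_autorange(threshold=0.1, max_run_time=5.0)
print(m.median, m.iqr, m.number_per_run, len(m.raw_times))
```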
diff --git a/torchgen/api/unboxing.py b/torchgen/api/unboxing.py
index 0a3aad42864ed..df4430c49b745 100644
--- a/torchgen/api/unboxing.py
+++ b/torchgen/api/unboxing.py
@@ -114,7 +114,7 @@ def convert_arguments(f: NativeFunction) -> Tuple[List[Binding], List[str]]:
         for i in range(len(args))
     ] + [""]
     binding_list = []
-    for i, arg in enumerate(args):
+    for arg in args:
         # expecting only Argument
         if not isinstance(arg.argument, Argument):
             raise Exception(