
Commit

Update on "[inductor] Implement clone removal for user defined triton…
Browse files Browse the repository at this point in the history
… kernel via reinplace_scatters"

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 aakhundov ColinPeppler

[ghstack-poisoned]
oulgen committed Oct 22, 2023
2 parents e8095fb + 5b3a4ae commit e5236b0
Showing 110 changed files with 2,221 additions and 824 deletions.
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-e3fb8c09b1ae675832424d7bf5f3484f697efd39
+68161e98aaeaeca02166063d19de92e81ea00c3b
8 changes: 5 additions & 3 deletions .github/scripts/tryrebase.py
@@ -85,11 +85,13 @@ def rebase_ghstack_onto(
) -> None:
if (
subprocess.run(
-            [sys.executable, "-m", "ghstack", "--help"], capture_output=True
+            [sys.executable, "-m", "ghstack", "--help"],
+            capture_output=True,
+            check=False,
).returncode
!= 0
):
-        subprocess.run([sys.executable, "-m", "pip", "install", "ghstack"])
+        subprocess.run([sys.executable, "-m", "pip", "install", "ghstack"], check=True)
orig_ref = f"{re.sub(r'/head$', '/orig', pr.head_ref())}"

repo.fetch(orig_ref, orig_ref)
@@ -116,7 +118,7 @@ def rebase_ghstack_onto(
if dry_run:
print("Don't know how to dry-run ghstack")
else:
-            ghstack_result = subprocess.run(["ghstack"], capture_output=True)
+            ghstack_result = subprocess.run(["ghstack"], capture_output=True, check=True)
push_result = ghstack_result.stdout.decode("utf-8")
print(push_result)
if ghstack_result.returncode != 0:
2 changes: 2 additions & 0 deletions .github/workflows/create_release.yml
@@ -30,6 +30,8 @@ jobs:
run: |
tag_or_branch="${PT_GITHUB_REF#refs/tags/}"
tag_or_branch="${tag_or_branch#refs/heads/}"
+          # replace directory separators with _ in branch name
+          tag_or_branch="${tag_or_branch//\//_}"
echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV"
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
- name: Create source distribution
3 changes: 0 additions & 3 deletions aten/src/ATen/Utils.cpp
@@ -1,13 +1,10 @@
#include <ATen/Context.h>
#include <ATen/detail/CUDAHooksInterface.h>
-#include <ATen/Dispatch.h>
-#include <ATen/Functions.h>
#include <ATen/Utils.h>
#include <c10/util/accumulate.h>


// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stdarg.h>
#include <cstdlib>
#include <stdexcept>
#include <typeinfo>
1 change: 0 additions & 1 deletion aten/src/ATen/autocast_mode.cpp
@@ -1,6 +1,5 @@
#include <ATen/autocast_mode.h>

-#include <exception>
#include <mutex>
#include <ATen/CachedTensorUtils.h>
#include <c10/util/flat_hash_map.h>
17 changes: 7 additions & 10 deletions aten/src/ATen/core/Formatting.cpp
@@ -57,6 +57,7 @@ struct FormatGuard {
out.copyfmt(saved);
}
private:
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
std::ostream & out;
std::ios saved;
};
@@ -65,7 +66,7 @@ std::ostream& operator<<(std::ostream & out, const DeprecatedTypeProperties& t)
return out << t.toString();
}

-static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Tensor& self) {
+static std::tuple<double, int> __printFormat(std::ostream& stream, const Tensor& self) {
auto size = self.numel();
if(size == 0) {
return std::make_tuple(1., 0);
@@ -116,13 +117,13 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Tensor& self) {
}
}
double scale = 1;
-  int64_t sz = 11;
+  int sz = 11;
if(intMode) {
if(expMax > 9) {
sz = 11;
stream << std::scientific << std::setprecision(4);
} else {
-      sz = expMax + 1;
+      sz = static_cast<int>(expMax) + 1;
stream << defaultfloat;
}
} else {
@@ -141,7 +142,7 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Tensor& self) {
if(expMax == 0) {
sz = 7;
} else {
-      sz = expMax+6;
+      sz = static_cast<int>(expMax) + 6;
}
stream << std::fixed << std::setprecision(4);
}
@@ -163,9 +164,7 @@ static void printScale(std::ostream & stream, double scale) {
}
static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t linesize, int64_t indent)
{
-  double scale = 0.0;
-  int64_t sz = 0;
-  std::tie(scale, sz) = __printFormat(stream, self);
+  auto [scale, sz] = __printFormat(stream, self);

__printIndent(stream, indent);
int64_t nColumnPerLine = (linesize-indent)/(sz+1);
@@ -284,9 +283,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesize)
stream << "[ " << tensor_.toString() << "{}";
} else if(tensor.ndimension() == 1) {
if (tensor.numel() > 0) {
-      double scale = 0.0;
-      int64_t sz = 0;
-      std::tie(scale, sz) = __printFormat(stream, tensor);
+      auto [scale, sz] = __printFormat(stream, tensor);
if(scale != 1) {
printScale(stream, scale);
}
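Note on the Formatting.cpp hunks above: __printFormat now returns int instead of int64_t (with explicit static_casts at the narrowing assignments), and the std::tie-into-predeclared-locals pattern becomes a C++17 structured binding. A minimal standalone sketch of the binding cleanup, where formatParams is an illustrative stand-in for __printFormat:

    #include <iostream>
    #include <tuple>

    // Stand-in for __printFormat: returns a scale factor and a column width.
    static std::tuple<double, int> formatParams() {
      return std::make_tuple(1.0, 11);
    }

    int main() {
      // Before: dummy-initialized locals filled in through std::tie.
      double scaleOld = 0.0;
      int szOld = 0;
      std::tie(scaleOld, szOld) = formatParams();

      // After (C++17): one declaration, correctly typed, no dummy values.
      auto [scale, sz] = formatParams();

      std::cout << scaleOld << ' ' << szOld << ' ' << scale << ' ' << sz << '\n';
      return 0;
    }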
2 changes: 2 additions & 0 deletions aten/src/ATen/core/List_test.cpp
@@ -3,6 +3,7 @@

using namespace c10;

+// NOLINTBEGIN(performance-move-const-arg)
TEST(ListTestIValueBasedList, givenEmptyList_whenCallingEmpty_thenReturnsTrue) {
List<string> list;
EXPECT_TRUE(list.empty());
@@ -1159,3 +1160,4 @@ TEST(ListTest, toTypedList) {
genericList = impl::toList(std::move(stringList));
EXPECT_THROW(c10::impl::toTypedList<int64_t>(std::move(genericList)), c10::Error);
}
+// NOLINTEND(performance-move-const-arg)
1 change: 0 additions & 1 deletion aten/src/ATen/core/NamedTensor.cpp
@@ -2,7 +2,6 @@
#include <ATen/core/NamedTensor.h>

#include <ATen/core/TensorBase.h>
-#include <c10/util/C++17.h>

namespace at {

2 changes: 1 addition & 1 deletion aten/src/ATen/core/ivalue.cpp
@@ -1210,7 +1210,7 @@ TORCH_API intrusive_ptr<ivalue::Future> collectAny(
TypePtr typePtr,
std::vector<c10::Device> devices)
: srcFutures(srcs),
-      dstFuture(make_intrusive<ivalue::Future>(typePtr, std::move(devices))) {}
+      dstFuture(make_intrusive<ivalue::Future>(std::move(typePtr), std::move(devices))) {}
std::atomic<bool> done{false};
List<intrusive_ptr<ivalue::Future>> srcFutures;
intrusive_ptr<ivalue::Future> dstFuture;
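Aside on the ivalue.cpp change above: TypePtr is a reference-counted smart pointer taken by value, so moving its last use into the member saves a pair of atomic refcount operations. A minimal sketch of the idiom, with std::shared_ptr standing in for TypePtr and Widget/Holder as illustrative names rather than PyTorch types:

    #include <memory>
    #include <utility>

    struct Widget {};

    struct Holder {
      // Taking the pointer by value and moving it in means the caller pays for
      // at most one refcount bump; copying p here would pay for two.
      explicit Holder(std::shared_ptr<Widget> p) : p_(std::move(p)) {}
      std::shared_ptr<Widget> p_;
    };

    int main() {
      auto w = std::make_shared<Widget>();
      Holder h1(w);             // one copy into the parameter, then a free move
      Holder h2(std::move(w));  // or move all the way through: no refcount traffic
      return 0;
    }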
6 changes: 3 additions & 3 deletions aten/src/ATen/core/library.cpp
@@ -44,7 +44,7 @@ namespace {

CppFunction::CppFunction(c10::KernelFunction func, c10::optional<c10::impl::CppSignature> cpp_signature, std::unique_ptr<c10::FunctionSchema> schema)
: func_(std::move(func))
-  , cpp_signature_(std::move(cpp_signature))
+  , cpp_signature_(cpp_signature)
, schema_(std::move(schema))
, debug_()
{}
@@ -172,7 +172,7 @@ Library& Library::_def(std::variant<c10::OperatorName, c10::FunctionSchema>&& na
std::move(name),
dispatch_key,
std::move(f.func_),
-          std::move(f.cpp_signature_),
+          f.cpp_signature_,
std::move(f.schema_),
debugString(std::move(f.debug_), file_, line_)
)
@@ -223,7 +223,7 @@ Library& Library::_impl(const char* name_str, CppFunction&& f, _RegisterOrVerify
std::move(name),
dispatch_key,
std::move(f.func_),
-          std::move(f.cpp_signature_),
+          f.cpp_signature_,
std::move(f.schema_),
debugString(std::move(f.debug_), file_, line_)
)
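The library.cpp edits go the opposite way: std::move is dropped around cpp_signature, plausibly because c10::impl::CppSignature is cheap to copy, making the cast the same kind of performance-move-const-arg noise that List_test.cpp suppresses above. A sketch under that assumption; Signature and Entry are illustrative stand-ins, not PyTorch types:

    #include <type_traits>

    // Illustrative stand-in for a thin, trivially copyable signature handle.
    struct Signature {
      const void* id = nullptr;
    };
    static_assert(std::is_trivially_copyable_v<Signature>,
                  "moving a Signature is the same operation as copying it");

    struct Entry {
      Signature sig;
      explicit Entry(Signature s) : sig(s) {}  // std::move(s) would change nothing
    };

    int main() {
      Signature s{};
      Entry e(s);  // a plain copy; wrapping s in std::move would be lint noise
      return 0;
    }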
32 changes: 32 additions & 0 deletions aten/src/ATen/cuda/CUDAGraph.cpp
@@ -5,9 +5,13 @@
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>

+#include <chrono>
+#include <thread>
+
namespace at::cuda {

static bool _cuda_graphs_debug = false;
+constexpr int kSynchronizeBusyWaitMillis = 10;

MempoolId_t graph_pool_handle() {
#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
@@ -55,6 +59,25 @@ CaptureId_t capture_sequence_id()
* describes memory management for captures.
*/

+std::atomic<int> CUDAGraph::pending_event_queries = 0;
+
+// Track any outstanding event queries that could happen e.g., in a NCCL watchdog so that they
+// can be resolved before the capture begins. Note that event queries are not allowed during a
+// graph capture in the default capture mode.
+void CUDAGraph::inc_pending_event_queries() {
+  pending_event_queries++;
+}
+
+void CUDAGraph::dec_pending_event_queries() {
+  TORCH_INTERNAL_ASSERT(pending_event_queries > 0,
+    "Attempted to decrement the number of outstanding events to be queried, but it was <= 0.");
+  pending_event_queries--;
+}
+
+int CUDAGraph::num_pending_event_queries() {
+  return pending_event_queries;
+}
+
CUDAGraph::CUDAGraph()
// CUDAStreams may not be default-constructed.
: capture_stream_(at::cuda::getCurrentCUDAStream()) {
@@ -115,6 +138,15 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capture_mode)
// due to the capture status being updated _after_ a capture had already started.
c10::cuda::CUDACachingAllocator::beginAllocateStreamToPool(capture_dev_, capture_stream_, mempool_id_);

+  // At this point, any NCCL watchdogs should be aware that we are in capture mode
+  // and therefore should not enqueue any additional work that could be event-queried.
+  // We still must wait on any existing work that has not been cleaned up.
+  while (num_pending_event_queries()) {
+    TORCH_WARN_ONCE("Waiting for pending NCCL work to finish before starting graph capture.");
+    std::this_thread::sleep_for(
+      std::chrono::milliseconds(kSynchronizeBusyWaitMillis));
+  }
+
// cudaStreamCaptureModeGlobal is the most conservative option to
// prevent potentially unsafe CUDA API calls during capture. See
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
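The counter added above is a simple gate: producers such as the NCCL watchdog increment it before issuing an event query and decrement it when done, and capture_begin spins until the count drains. A self-contained sketch of the same gate, with std::thread standing in for the watchdog (simplified; the real code lives on at::cuda::CUDAGraph and warns via TORCH_WARN_ONCE):

    #include <atomic>
    #include <chrono>
    #include <iostream>
    #include <thread>

    static std::atomic<int> pending_queries{0};
    constexpr int kBusyWaitMillis = 10;  // mirrors kSynchronizeBusyWaitMillis

    int main() {
      // Watchdog-style producer: register the query up front, as
      // inc_pending_event_queries() does in the diff.
      pending_queries++;
      std::thread watchdog([] {
        std::this_thread::sleep_for(std::chrono::milliseconds(50));  // query in flight
        pending_queries--;  // dec_pending_event_queries()
      });

      // capture_begin-style consumer: busy-wait until outstanding queries drain.
      while (pending_queries.load() > 0) {
        std::this_thread::sleep_for(std::chrono::milliseconds(kBusyWaitMillis));
      }
      std::cout << "no pending event queries; safe to begin capture\n";
      watchdog.join();
      return 0;
    }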
7 changes: 7 additions & 0 deletions aten/src/ATen/cuda/CUDAGraph.h
@@ -5,6 +5,8 @@
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAStream.h>

+#include <mutex>
+
namespace at {

struct CUDAGeneratorImpl;
@@ -19,6 +21,9 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
CUDAGraph();
~CUDAGraph();

+  static void inc_pending_event_queries();
+  static void dec_pending_event_queries();
+  static int num_pending_event_queries();
void capture_begin(MempoolId_t pool={0, 0}, cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal);
void capture_end();
void replay();
@@ -33,6 +38,8 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
cudaGraphExec_t graph_exec_ = NULL;
#endif

+  static std::atomic<int> pending_event_queries;
+
// internal states so reset() can do its best cleaning up
// Set to true in capture_end if cudaStreamEndCapture succeeded
// Set back to false soon after, when graph_ is consumed by cudaGraphInstantiate
26 changes: 20 additions & 6 deletions aten/src/ATen/cuda/CachingHostAllocator.cpp
@@ -10,6 +10,7 @@
#include <cuda_runtime_api.h>
#include <stdint.h>
#include <deque>
+#include <future>
#include <memory>
#include <mutex>
#include <set>
@@ -397,7 +398,7 @@ class CUDAHostAllocator {
(void*)devptr == (void*)ptr,
"Host and device pointer dont match with cudaHostRegister. "
"Please dont use this feature by setting "
"PYTORCH_PINNED_ALLOC_CONF=use_cuda_host_register:False (default)",
"PYTORCH_CUDA_ALLOC_CONF=use_cuda_host_register:False (default)",
"");
}

@@ -412,18 +413,31 @@
size_t numMapThreads = c10::cuda::CUDACachingAllocator::
CUDAAllocatorConfig::pinned_num_register_threads();
if ((numMapThreads > 1) && (roundSize >= (pageSize * numMapThreads))) {
+      // parallelize the mapping of pages with a threadpool
auto* pool = getThreadPool();
+      std::vector<std::promise<void>> promises;
+      std::vector<std::future<void>> futures;
+      promises.reserve(numMapThreads);
+      futures.reserve(numMapThreads);
+
for (size_t i = 0; i < numMapThreads; i++) {
-        pool->run(std::bind(
-            &CUDAHostAllocator::mapPagesForRegister,
-            this,
+        promises.emplace_back();
+        futures.push_back(promises[i].get_future());
+        auto task = [this, i, ptr, roundSize, numMapThreads, pageSize, &promises]() mutable {
+          mapPagesForRegister(
*ptr,
roundSize,
i, // thread task-id
numMapThreads,
-            pageSize));
+              pageSize);
+          // set the promise when mapping pages are done
+          promises[i].set_value();
+        };
+        pool->run(task);
}
+      for (auto& future : futures) {
+        future.wait();
+      }
-      pool->waitWorkComplete();
} else {
// Map pages in the same thread
mapPagesForRegister(*ptr, roundSize, 0, 1, pageSize);
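The allocator change above swaps pool->waitWorkComplete(), which blocks on every task in the shared pool, for one promise/future pair per mapping task, so the submitter waits only on its own work. A standalone sketch of that fan-out/fan-in pattern; std::thread stands in for the internal pool and mapSlice for mapPagesForRegister:

    #include <future>
    #include <iostream>
    #include <thread>
    #include <vector>

    // Illustrative stand-in for mapPagesForRegister: handle one slice of pages.
    void mapSlice(int taskId, int numTasks) {
      std::cout << "slice " << taskId << " of " << numTasks << " mapped\n";
    }

    int main() {
      const int numTasks = 4;
      // Sized up front so &promises stays valid while tasks run (the diff
      // reserves for the same reason before emplacing).
      std::vector<std::promise<void>> promises(numTasks);
      std::vector<std::future<void>> futures;
      std::vector<std::thread> threads;
      futures.reserve(numTasks);

      for (int i = 0; i < numTasks; i++) {
        futures.push_back(promises[i].get_future());
        threads.emplace_back([i, numTasks, &promises] {
          mapSlice(i, numTasks);
          promises[i].set_value();  // signal completion, like the diff's task lambda
        });
      }
      for (auto& f : futures) {
        f.wait();  // block only on these tasks, not on the whole pool
      }
      for (auto& t : threads) {
        t.join();
      }
      return 0;
    }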
