
Commit

Update on "[inductor] Implement clone removal for user defined triton…
Browse files Browse the repository at this point in the history
… kernel via reinplace_scatters"

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 aakhundov ColinPeppler

[ghstack-poisoned]
oulgen committed Oct 22, 2023
2 parents e8095fb + 5b3a4ae commit e5236b0
Showing 110 changed files with 2,221 additions and 824 deletions.
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-e3fb8c09b1ae675832424d7bf5f3484f697efd39
+68161e98aaeaeca02166063d19de92e81ea00c3b
8 changes: 5 additions & 3 deletions .github/scripts/tryrebase.py
@@ -85,11 +85,13 @@ def rebase_ghstack_onto(
) -> None:
if (
subprocess.run(
-            [sys.executable, "-m", "ghstack", "--help"], capture_output=True
+            [sys.executable, "-m", "ghstack", "--help"],
+            capture_output=True,
+            check=False,
).returncode
!= 0
):
-        subprocess.run([sys.executable, "-m", "pip", "install", "ghstack"])
+        subprocess.run([sys.executable, "-m", "pip", "install", "ghstack"], check=True)
orig_ref = f"{re.sub(r'/head$', '/orig', pr.head_ref())}"

repo.fetch(orig_ref, orig_ref)
@@ -116,7 +118,7 @@ def rebase_ghstack_onto(
if dry_run:
print("Don't know how to dry-run ghstack")
else:
-            ghstack_result = subprocess.run(["ghstack"], capture_output=True)
+            ghstack_result = subprocess.run(["ghstack"], capture_output=True, check=True)
push_result = ghstack_result.stdout.decode("utf-8")
print(push_result)
if ghstack_result.returncode != 0:
2 changes: 2 additions & 0 deletions .github/workflows/create_release.yml
@@ -30,6 +30,8 @@ jobs:
run: |
tag_or_branch="${PT_GITHUB_REF#refs/tags/}"
tag_or_branch="${tag_or_branch#refs/heads/}"
+          # replace directory separators with _ in branch name
+          tag_or_branch="${tag_or_branch//\//_}"
echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV"
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
- name: Create source distribution
3 changes: 0 additions & 3 deletions aten/src/ATen/Utils.cpp
@@ -1,13 +1,10 @@
#include <ATen/Context.h>
#include <ATen/detail/CUDAHooksInterface.h>
-#include <ATen/Dispatch.h>
-#include <ATen/Functions.h>
#include <ATen/Utils.h>
#include <c10/util/accumulate.h>


// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stdarg.h>
#include <cstdlib>
#include <stdexcept>
#include <typeinfo>
1 change: 0 additions & 1 deletion aten/src/ATen/autocast_mode.cpp
@@ -1,6 +1,5 @@
#include <ATen/autocast_mode.h>

-#include <exception>
#include <mutex>
#include <ATen/CachedTensorUtils.h>
#include <c10/util/flat_hash_map.h>
17 changes: 7 additions & 10 deletions aten/src/ATen/core/Formatting.cpp
@@ -57,6 +57,7 @@ struct FormatGuard {
out.copyfmt(saved);
}
private:
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
std::ostream & out;
std::ios saved;
};
@@ -65,7 +66,7 @@ std::ostream& operator<<(std::ostream & out, const DeprecatedTypeProperties& t)
return out << t.toString();
}

-static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Tensor& self) {
+static std::tuple<double, int> __printFormat(std::ostream& stream, const Tensor& self) {
auto size = self.numel();
if(size == 0) {
return std::make_tuple(1., 0);
@@ -116,13 +117,13 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Tensor& self) {
}
}
double scale = 1;
-  int64_t sz = 11;
+  int sz = 11;
if(intMode) {
if(expMax > 9) {
sz = 11;
stream << std::scientific << std::setprecision(4);
} else {
-      sz = expMax + 1;
+      sz = static_cast<int>(expMax) + 1;
stream << defaultfloat;
}
} else {
@@ -141,7 +142,7 @@ static std::tuple<double, int64_t> __printFormat(std::ostream& stream, const Tensor& self) {
if(expMax == 0) {
sz = 7;
} else {
-      sz = expMax+6;
+      sz = static_cast<int>(expMax) + 6;
}
stream << std::fixed << std::setprecision(4);
}
@@ -163,9 +164,7 @@ static void printScale(std::ostream & stream, double scale) {
}
static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t linesize, int64_t indent)
{
-  double scale = 0.0;
-  int64_t sz = 0;
-  std::tie(scale, sz) = __printFormat(stream, self);
+  auto [scale, sz] = __printFormat(stream, self);

__printIndent(stream, indent);
int64_t nColumnPerLine = (linesize-indent)/(sz+1);
@@ -284,9 +283,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesize)
stream << "[ " << tensor_.toString() << "{}";
} else if(tensor.ndimension() == 1) {
if (tensor.numel() > 0) {
-      double scale = 0.0;
-      int64_t sz = 0;
-      std::tie(scale, sz) = __printFormat(stream, tensor);
+      auto [scale, sz] = __printFormat(stream, tensor);
if(scale != 1) {
printScale(stream, scale);
}
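Note on the Formatting.cpp hunks above: __printFormat now returns int instead of int64_t (with explicit static_casts at the narrowing assignments), and the std::tie-into-predeclared-locals pattern becomes a C++17 structured binding. A minimal standalone sketch of the binding cleanup, where formatParams is an illustrative stand-in for __printFormat:

    #include <iostream>
    #include <tuple>

    // Stand-in for __printFormat: returns a scale factor and a column width.
    static std::tuple<double, int> formatParams() {
      return std::make_tuple(1.0, 11);
    }

    int main() {
      // Before: dummy-initialized locals filled in through std::tie.
      double scaleOld = 0.0;
      int szOld = 0;
      std::tie(scaleOld, szOld) = formatParams();

      // After (C++17): one declaration, correctly typed, no dummy values.
      auto [scale, sz] = formatParams();

      std::cout << scaleOld << ' ' << szOld << ' ' << scale << ' ' << sz << '\n';
      return 0;
    }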
2 changes: 2 additions & 0 deletions aten/src/ATen/core/List_test.cpp
@@ -3,6 +3,7 @@

using namespace c10;

+// NOLINTBEGIN(performance-move-const-arg)
TEST(ListTestIValueBasedList, givenEmptyList_whenCallingEmpty_thenReturnsTrue) {
List<string> list;
EXPECT_TRUE(list.empty());
@@ -1159,3 +1160,4 @@ TEST(ListTest, toTypedList) {
genericList = impl::toList(std::move(stringList));
EXPECT_THROW(c10::impl::toTypedList<int64_t>(std::move(genericList)), c10::Error);
}
+// NOLINTEND(performance-move-const-arg)
1 change: 0 additions & 1 deletion aten/src/ATen/core/NamedTensor.cpp
@@ -2,7 +2,6 @@
#include <ATen/core/NamedTensor.h>

#include <ATen/core/TensorBase.h>
-#include <c10/util/C++17.h>

namespace at {

2 changes: 1 addition & 1 deletion aten/src/ATen/core/ivalue.cpp
@@ -1210,7 +1210,7 @@ TORCH_API intrusive_ptr<ivalue::Future> collectAny(
TypePtr typePtr,
std::vector<c10::Device> devices)
: srcFutures(srcs),
-      dstFuture(make_intrusive<ivalue::Future>(typePtr, std::move(devices))) {}
+      dstFuture(make_intrusive<ivalue::Future>(std::move(typePtr), std::move(devices))) {}
std::atomic<bool> done{false};
List<intrusive_ptr<ivalue::Future>> srcFutures;
intrusive_ptr<ivalue::Future> dstFuture;
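Aside on the ivalue.cpp change above: TypePtr is a reference-counted smart pointer taken by value, so moving its last use into the member saves a pair of atomic refcount operations. A minimal sketch of the idiom, with std::shared_ptr standing in for TypePtr and Widget/Holder as illustrative names rather than PyTorch types:

    #include <memory>
    #include <utility>

    struct Widget {};

    struct Holder {
      // Taking the pointer by value and moving it in means the caller pays for
      // at most one refcount bump; copying p here would pay for two.
      explicit Holder(std::shared_ptr<Widget> p) : p_(std::move(p)) {}
      std::shared_ptr<Widget> p_;
    };

    int main() {
      auto w = std::make_shared<Widget>();
      Holder h1(w);             // one copy into the parameter, then a free move
      Holder h2(std::move(w));  // or move all the way through: no refcount traffic
      return 0;
    }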
6 changes: 3 additions & 3 deletions aten/src/ATen/core/library.cpp
@@ -44,7 +44,7 @@ namespace {

CppFunction::CppFunction(c10::KernelFunction func, c10::optional<c10::impl::CppSignature> cpp_signature, std::unique_ptr<c10::FunctionSchema> schema)
: func_(std::move(func))
-  , cpp_signature_(std::move(cpp_signature))
+  , cpp_signature_(cpp_signature)
, schema_(std::move(schema))
, debug_()
{}
@@ -172,7 +172,7 @@ Library& Library::_def(std::variant<c10::OperatorName, c10::FunctionSchema>&& na
std::move(name),
dispatch_key,
std::move(f.func_),
-          std::move(f.cpp_signature_),
+          f.cpp_signature_,
std::move(f.schema_),
debugString(std::move(f.debug_), file_, line_)
)
@@ -223,7 +223,7 @@ Library& Library::_impl(const char* name_str, CppFunction&& f, _RegisterOrVerify
std::move(name),
dispatch_key,
std::move(f.func_),
-          std::move(f.cpp_signature_),
+          f.cpp_signature_,
std::move(f.schema_),
debugString(std::move(f.debug_), file_, line_)
)
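The library.cpp edits go the opposite way: std::move is dropped around cpp_signature, plausibly because c10::impl::CppSignature is cheap to copy, making the cast the same kind of performance-move-const-arg noise that List_test.cpp suppresses above. A sketch under that assumption; Signature and Entry are illustrative stand-ins, not PyTorch types:

    #include <type_traits>

    // Illustrative stand-in for a thin, trivially copyable signature handle.
    struct Signature {
      const void* id = nullptr;
    };
    static_assert(std::is_trivially_copyable_v<Signature>,
                  "moving a Signature is the same operation as copying it");

    struct Entry {
      Signature sig;
      explicit Entry(Signature s) : sig(s) {}  // std::move(s) would change nothing
    };

    int main() {
      Signature s{};
      Entry e(s);  // a plain copy; wrapping s in std::move would be lint noise
      return 0;
    }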
32 changes: 32 additions & 0 deletions aten/src/ATen/cuda/CUDAGraph.cpp
@@ -5,9 +5,13 @@
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>

+#include <chrono>
+#include <thread>
+
namespace at::cuda {

static bool _cuda_graphs_debug = false;
+constexpr int kSynchronizeBusyWaitMillis = 10;

MempoolId_t graph_pool_handle() {
#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
@@ -55,6 +59,25 @@ CaptureId_t capture_sequence_id()
* describes memory management for captures.
*/

+std::atomic<int> CUDAGraph::pending_event_queries = 0;
+
+// Track any outstanding event queries that could happen e.g., in a NCCL watchdog so that they
+// can be resolved before the capture begins. Note that event queries are not allowed during a
+// graph capture in the default capture mode.
+void CUDAGraph::inc_pending_event_queries() {
+  pending_event_queries++;
+}
+
+void CUDAGraph::dec_pending_event_queries() {
+  TORCH_INTERNAL_ASSERT(pending_event_queries > 0,
+    "Attempted to decrement the number of outstanding events to be queried, but it was <= 0.");
+  pending_event_queries--;
+}
+
+int CUDAGraph::num_pending_event_queries() {
+  return pending_event_queries;
+}
+
CUDAGraph::CUDAGraph()
// CUDAStreams may not be default-constructed.
: capture_stream_(at::cuda::getCurrentCUDAStream()) {
@@ -115,6 +138,15 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capture_mode)
// due to the capture status being updated _after_ a capture had already started.
c10::cuda::CUDACachingAllocator::beginAllocateStreamToPool(capture_dev_, capture_stream_, mempool_id_);

+  // At this point, any NCCL watchdogs should be aware that we are in capture mode
+  // and therefore should not enqueue any additional work that could be event-queried.
+  // We still must wait on any existing work that has not been cleaned up.
+  while (num_pending_event_queries()) {
+    TORCH_WARN_ONCE("Waiting for pending NCCL work to finish before starting graph capture.");
+    std::this_thread::sleep_for(
+      std::chrono::milliseconds(kSynchronizeBusyWaitMillis));
+  }
+
// cudaStreamCaptureModeGlobal is the most conservative option to
// prevent potentially unsafe CUDA API calls during capture. See
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
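The counter added above is a simple gate: producers such as the NCCL watchdog increment it before issuing an event query and decrement it when done, and capture_begin spins until the count drains. A self-contained sketch of the same gate, with std::thread standing in for the watchdog (simplified; the real code lives on at::cuda::CUDAGraph and warns via TORCH_WARN_ONCE):

    #include <atomic>
    #include <chrono>
    #include <iostream>
    #include <thread>

    static std::atomic<int> pending_queries{0};
    constexpr int kBusyWaitMillis = 10;  // mirrors kSynchronizeBusyWaitMillis

    int main() {
      // Watchdog-style producer: register the query up front, as
      // inc_pending_event_queries() does in the diff.
      pending_queries++;
      std::thread watchdog([] {
        std::this_thread::sleep_for(std::chrono::milliseconds(50));  // query in flight
        pending_queries--;  // dec_pending_event_queries()
      });

      // capture_begin-style consumer: busy-wait until outstanding queries drain.
      while (pending_queries.load() > 0) {
        std::this_thread::sleep_for(std::chrono::milliseconds(kBusyWaitMillis));
      }
      std::cout << "no pending event queries; safe to begin capture\n";
      watchdog.join();
      return 0;
    }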
7 changes: 7 additions & 0 deletions aten/src/ATen/cuda/CUDAGraph.h
@@ -5,6 +5,8 @@
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAStream.h>

+#include <mutex>
+
namespace at {

struct CUDAGeneratorImpl;
@@ -19,6 +21,9 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
CUDAGraph();
~CUDAGraph();

+  static void inc_pending_event_queries();
+  static void dec_pending_event_queries();
+  static int num_pending_event_queries();
void capture_begin(MempoolId_t pool={0, 0}, cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal);
void capture_end();
void replay();
@@ -33,6 +38,8 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
cudaGraphExec_t graph_exec_ = NULL;
#endif

+  static std::atomic<int> pending_event_queries;
+
// internal states so reset() can do its best cleaning up
// Set to true in capture_end if cudaStreamEndCapture succeeded
// Set back to false soon after, when graph_ is consumed by cudaGraphInstantiate
26 changes: 20 additions & 6 deletions aten/src/ATen/cuda/CachingHostAllocator.cpp
@@ -10,6 +10,7 @@
#include <cuda_runtime_api.h>
#include <stdint.h>
#include <deque>
+#include <future>
#include <memory>
#include <mutex>
#include <set>
@@ -397,7 +398,7 @@ class CUDAHostAllocator {
(void*)devptr == (void*)ptr,
"Host and device pointer dont match with cudaHostRegister. "
"Please dont use this feature by setting "
"PYTORCH_PINNED_ALLOC_CONF=use_cuda_host_register:False (default)",
"PYTORCH_CUDA_ALLOC_CONF=use_cuda_host_register:False (default)",
"");
}

@@ -412,18 +413,31 @@
size_t numMapThreads = c10::cuda::CUDACachingAllocator::
CUDAAllocatorConfig::pinned_num_register_threads();
if ((numMapThreads > 1) && (roundSize >= (pageSize * numMapThreads))) {
+      // parallelize the mapping of pages with a threadpool
auto* pool = getThreadPool();
+      std::vector<std::promise<void>> promises;
+      std::vector<std::future<void>> futures;
+      promises.reserve(numMapThreads);
+      futures.reserve(numMapThreads);
+
for (size_t i = 0; i < numMapThreads; i++) {
-        pool->run(std::bind(
-            &CUDAHostAllocator::mapPagesForRegister,
-            this,
+        promises.emplace_back();
+        futures.push_back(promises[i].get_future());
+        auto task = [this, i, ptr, roundSize, numMapThreads, pageSize, &promises]() mutable {
+          mapPagesForRegister(
*ptr,
roundSize,
i, // thread task-id
numMapThreads,
-            pageSize));
+              pageSize);
+          // set the promise when mapping pages are done
+          promises[i].set_value();
+        };
+        pool->run(task);
}
+      for (auto& future : futures) {
+        future.wait();
+      }
-      pool->waitWorkComplete();
} else {
// Map pages in the same thread
mapPagesForRegister(*ptr, roundSize, 0, 1, pageSize);
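The allocator change above swaps pool->waitWorkComplete(), which blocks on every task in the shared pool, for one promise/future pair per mapping task, so the submitter waits only on its own work. A standalone sketch of that fan-out/fan-in pattern; std::thread stands in for the internal pool and mapSlice for mapPagesForRegister:

    #include <future>
    #include <iostream>
    #include <thread>
    #include <vector>

    // Illustrative stand-in for mapPagesForRegister: handle one slice of pages.
    void mapSlice(int taskId, int numTasks) {
      std::cout << "slice " << taskId << " of " << numTasks << " mapped\n";
    }

    int main() {
      const int numTasks = 4;
      // Sized up front so &promises stays valid while tasks run (the diff
      // reserves for the same reason before emplacing).
      std::vector<std::promise<void>> promises(numTasks);
      std::vector<std::future<void>> futures;
      std::vector<std::thread> threads;
      futures.reserve(numTasks);

      for (int i = 0; i < numTasks; i++) {
        futures.push_back(promises[i].get_future());
        threads.emplace_back([i, numTasks, &promises] {
          mapSlice(i, numTasks);
          promises[i].set_value();  // signal completion, like the diff's task lambda
        });
      }
      for (auto& f : futures) {
        f.wait();  // block only on these tasks, not on the whole pool
      }
      for (auto& t : threads) {
        t.join();
      }
      return 0;
    }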
