From 72bc60ebc4788630aa4258d027015b939e8b09ca Mon Sep 17 00:00:00 2001
From: Jeff Daily
Date: Mon, 19 Oct 2020 21:02:22 +0000
Subject: [PATCH] ProcessGroupNCCL::alltoall_base calls recordStream

---
 torch/lib/c10d/ProcessGroupNCCL.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp
index 90a858adca96..36880f22b36d 100644
--- a/torch/lib/c10d/ProcessGroupNCCL.cpp
+++ b/torch/lib/c10d/ProcessGroupNCCL.cpp
@@ -1435,6 +1435,9 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::alltoall_base(
             at::Tensor& output,
             ncclComm_t comm,
             at::cuda::CUDAStream& stream) {
+          // See [Sync Streams].
+          c10::cuda::CUDACachingAllocator::recordStream(
+              output.storage().data_ptr(), stream);
           torch::cuda::nccl::all2all(
               input,
               output,
@@ -1464,6 +1467,9 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::alltoall_base(
               inputSplitSizes, input, &send_lengths, &send_offsets);
           c10d::computeLengthsAndOffsets(
               outputSplitSizes, output, &recv_lengths, &recv_offsets);
+          // See [Sync Streams].
+          c10::cuda::CUDACachingAllocator::recordStream(
+              output.storage().data_ptr(), stream);
           return ncclAlltoallv(
               input.data_ptr(),
               send_lengths.data(),
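
Why the change is needed: ProcessGroupNCCL launches NCCL kernels on a
dedicated side stream, while the output tensor's memory is owned by the
CUDA caching allocator and associated with the stream it was allocated
on. recordStream marks the block as also in use on the side stream so
the allocator will not recycle it until that stream's pending work
completes; this is the contract the [Sync Streams] note referenced in
the added comments describes. Below is a minimal standalone sketch of
the same pattern, not part of the patch; the function name
useOnSideStream and the add_ stand-in for the NCCL kernel are
illustrative assumptions.

#include <ATen/ATen.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>

void useOnSideStream(at::Tensor output) {
  // A side stream, analogous to the stream ProcessGroupNCCL runs its
  // collectives on. `output` was allocated on the current stream.
  c10::cuda::CUDAStream sideStream = c10::cuda::getStreamFromPool();

  // Mark output's block as in use on sideStream. Without this, freeing
  // `output` could let the caching allocator hand the block to a new
  // allocation before the side-stream kernel has finished with it.
  c10::cuda::CUDACachingAllocator::recordStream(
      output.storage().data_ptr(), sideStream);

  // Launch work on the side stream (stand-in for the all-to-all kernel).
  c10::cuda::CUDAStreamGuard guard(sideStream);
  output.add_(1);
}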