ProcessGroupNCCL::alltoall_base calls recordStream
jeffdaily committed Oct 19, 2020
1 parent 172ed51 commit 72bc60e
Showing 1 changed file with 6 additions and 0 deletions.
torch/lib/c10d/ProcessGroupNCCL.cpp (6 additions, 0 deletions)
@@ -1435,6 +1435,9 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::alltoall_base(
             at::Tensor& output,
             ncclComm_t comm,
             at::cuda::CUDAStream& stream) {
+          // See [Sync Streams].
+          c10::cuda::CUDACachingAllocator::recordStream(
+              output.storage().data_ptr(), stream);
           torch::cuda::nccl::all2all(
               input,
               output,
@@ -1464,6 +1467,9 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::alltoall_base(
               inputSplitSizes, input, &send_lengths, &send_offsets);
           c10d::computeLengthsAndOffsets(
               outputSplitSizes, output, &recv_lengths, &recv_offsets);
+          // See [Sync Streams].
+          c10::cuda::CUDACachingAllocator::recordStream(
+              output.storage().data_ptr(), stream);
           return ncclAlltoallv(
               input.data_ptr(),
               send_lengths.data(),
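For context (an explanatory aside, not part of the commit): the "[Sync Streams]" note referenced by the added comment explains that ProcessGroupNCCL runs collectives on dedicated internal CUDA streams, while the tensors they touch are allocated by the caching allocator on the current stream. recordStream marks the tensor's storage as in use on the side stream, so the allocator will not hand its memory to a new allocation until the work already queued on that stream has completed. Without it, the caller could free `output` and a later allocation could reuse the memory while the all-to-all is still writing. A minimal sketch of the pattern, assuming a CUDA-enabled libtorch build (the function name side_stream_use is hypothetical):

#include <ATen/ATen.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAStream.h>

void side_stream_use(const at::Tensor& output) {
  // Take a side stream distinct from the current (allocation) stream;
  // this plays the role of the NCCL stream in alltoall_base above.
  c10::cuda::CUDAStream side = c10::cuda::getStreamFromPool();

  // ... enqueue kernels that read or write `output` on `side` ...

  // Record the usage: the caching allocator will delay recycling the
  // storage's block until the work queued on `side` has completed.
  c10::cuda::CUDACachingAllocator::recordStream(
      output.storage().data_ptr(), side);
}

In the diff above, `stream` is exactly such a side stream and `output` is the buffer NCCL writes into, which is why both alltoall_base code paths gain the same recordStream call.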
