Update CUDA out of memory message with private pool info
ghstack-source-id: df2d76357eac5113798fabd7f02bc6eaeb41afca
Pull Request resolved: #124673
isuruf committed Apr 23, 2024
1 parent bb37910 commit 93d1fa6
Showing 1 changed file with 26 additions and 3 deletions.
29 changes: 26 additions & 3 deletions c10/cuda/CUDACachingAllocator.cpp
@@ -1111,6 +1111,26 @@ class DeviceCachingAllocator {
             .current;
     auto observers_local = oom_observers_;
 
+    size_t allocated_in_private_pools = 0;
+    auto get_size_block = [](const BlockPool& pool) {
+      size_t res = 0;
+      for (const auto& block : pool.blocks) {
+        res += block->size;
+      }
+      return res;
+    };
+    for (const auto& p : graph_pools) {
+      allocated_in_private_pools += get_size_block(p.second->large_blocks);
+      allocated_in_private_pools += get_size_block(p.second->small_blocks);
+    }
+
+    std::string private_pool_msg;
+
+    if (allocated_in_private_pools > 0) {
+      private_pool_msg = "with " + format_size(allocated_in_private_pools) +
+          " allocated in private pools (eg: CUDAGraphs), ";
+    }
+
     // Make sure we do not have the device lock before calling our
     // observers which might need hold the GIL
     // It is safe to release at this point because will no longer
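The hunk above sums the sizes of all blocks cached in private (CUDA graph) pools so that the OOM report can account for them separately. A minimal standalone sketch of that accounting, using deliberately simplified, hypothetical stand-ins for Block, BlockPool, PrivatePool, and graph_pools (the real types in CUDACachingAllocator.cpp carry much more state), could look like this:

#include <cstddef>
#include <map>
#include <memory>
#include <set>

// Simplified, hypothetical stand-ins for the allocator's internal types;
// they model only what the accounting below needs.
struct Block {
  std::size_t size;
};

struct BlockPool {
  std::set<Block*> blocks; // cached blocks owned by this pool
};

struct PrivatePool {
  BlockPool large_blocks;
  BlockPool small_blocks;
};

// Sum the sizes of every block cached in private pools, mirroring the
// loop added in this commit (roughly one private pool per CUDA graph capture).
std::size_t private_pool_bytes(
    const std::map<int, std::unique_ptr<PrivatePool>>& graph_pools) {
  auto get_size_block = [](const BlockPool& pool) {
    std::size_t res = 0;
    for (const auto& block : pool.blocks) {
      res += block->size;
    }
    return res;
  };
  std::size_t allocated_in_private_pools = 0;
  for (const auto& p : graph_pools) {
    allocated_in_private_pools += get_size_block(p.second->large_blocks);
    allocated_in_private_pools += get_size_block(p.second->small_blocks);
  }
  return allocated_in_private_pools;
}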
@@ -1157,9 +1177,12 @@ class DeviceCachingAllocator {
" is free. ",
proc_info,
"Of the allocated memory ",
format_size(allocated_bytes),
" is allocated by PyTorch, and ",
format_size(reserved_bytes - allocated_bytes),
format_size(allocated_bytes + allocated_in_private_pools),
" is allocated by PyTorch, ",
private_pool_msg,
"and ",
format_size(
reserved_bytes - allocated_bytes - allocated_in_private_pools),
" is reserved by PyTorch but unallocated.",
" If reserved but unallocated memory is large try setting",
" PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid"
