From cedfa4ccd89d3b2869e8433cc8a02ba6afa8e8c2 Mon Sep 17 00:00:00 2001
From: Michael Carilli
Date: Thu, 28 Jan 2021 10:47:48 -0800
Subject: [PATCH] Make DeviceCachingAllocator's error handling more defensive
 and a bit easier to read (#51158)

Summary:
^
Currently, `alloc_block`'s error handling has a couple of (imo) minor flaws. It
might clear the error state even if the error had nothing to do with memory
allocation. It might also clear the error state even if it didn't attempt a
cudaMalloc, meaning it might clear an error state that came from some
completely unrelated earlier CUDA call.

The diffs and comments are the best explanation of my preferred (new)
error-checking policy. The diffs add very little work to the common
(successful, allocation satisfied by an existing block) hot path. Most of the
additional logic occurs in `alloc_block`, which is a slow path anyway (it tries
cudaMalloc).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/51158

Reviewed By: malfet, heitorschueroff

Differential Revision: D26101515

Pulled By: ezyang

fbshipit-source-id: 6b447f1770974a04450376afd9726be87af83c48
---
 c10/cuda/CUDACachingAllocator.cpp | 127 +++++++++++++++++-------------
 1 file changed, 72 insertions(+), 55 deletions(-)

diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 493296248e5b..1481620ded29 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -57,12 +57,12 @@ namespace {
 
 using stream_set = std::unordered_set<cuda::CUDAStream>;
 
-constexpr size_t kMinBlockSize = 512;       // all sizes are rounded to at least 512 bytes
-constexpr size_t kSmallSize = 1048576;      // largest "small" allocation is 1 MiB
-constexpr size_t kSmallBuffer = 2097152;    // "small" allocations are packed in 2 MiB blocks
-constexpr size_t kLargeBuffer = 20971520;   // "large" allocations may be packed in 20 MiB blocks
+constexpr size_t kMinBlockSize = 512; // all sizes are rounded to at least 512 bytes
+constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB
+constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks
+constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks
 constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer
-constexpr size_t kRoundLarge = 2097152;     // round up large allocations to 2 MiB
+constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB
 
 typedef std::bitset<static_cast<size_t>(StatType::NUM_TYPES)> StatTypes;
 
@@ -242,56 +242,57 @@ class DeviceCachingAllocator {
       // Free all non-split cached blocks and retry alloc.
       || (free_cached_blocks() && alloc_block(params, true));
 
-    TORCH_INTERNAL_ASSERT((!block_found && params.err != cudaSuccess) || params.block);
     if (!block_found) {
-      if (params.err == cudaErrorMemoryAllocation) {
-        size_t device_free;
-        size_t device_total;
-        C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
-        std::string allowed_info;
-
-        if (set_fraction) {
-          allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
-        }
+      // For any error code other than cudaErrorMemoryAllocation,
+      // alloc_block should have thrown an exception already.
+      TORCH_INTERNAL_ASSERT(params.err == cudaErrorMemoryAllocation);
 
-        stats.num_ooms += 1;
-
-        // "total capacity": total global memory on GPU
-        // "allowed": memory is allowed to use, which set by fraction.
-        // "already allocated": memory allocated by the program using the
-        //                      caching allocator
-        // "free": free memory as reported by the CUDA API
-        // "cached": memory held by the allocator but not used by the program
-        //
-        // The "allocated" amount does not include memory allocated outside
-        // of the caching allocator, such as memory allocated by other programs
-        // or memory held by the driver.
-        //
-        // The sum of "allocated" + "free" + "cached" may be less than the
-        // total capacity due to memory held by the driver and usage by other
-        // programs.
-        //
-        // Note that at this point free_cached_blocks has already returned all
-        // possible "cached" memory to the driver. The only remaining "cached"
-        // memory is split from a larger block that is partially in-use.
-        TORCH_CHECK_WITH(CUDAOutOfMemoryError, false,
-          "CUDA out of memory. Tried to allocate ", format_size(alloc_size),
-          " (GPU ", device, "; ",
-          format_size(device_total), " total capacity; ",
-          format_size(stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current),
-          " already allocated; ",
-          format_size(device_free), " free; ",
-          allowed_info,
-          format_size(stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current),
-          " reserved in total by PyTorch)");
-      } else {
-        C10_CUDA_CHECK(params.err);
-      }
+      size_t device_free;
+      size_t device_total;
+      C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
+      std::string allowed_info;
+
+      if (set_fraction) {
+        allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
+      }
+
+      stats.num_ooms += 1;
+
+      // "total capacity": total global memory on GPU
+      // "allowed": memory is allowed to use, which set by fraction.
+      // "already allocated": memory allocated by the program using the
+      //                      caching allocator
+      // "free": free memory as reported by the CUDA API
+      // "cached": memory held by the allocator but not used by the program
+      //
+      // The "allocated" amount does not include memory allocated outside
+      // of the caching allocator, such as memory allocated by other programs
+      // or memory held by the driver.
+      //
+      // The sum of "allocated" + "free" + "cached" may be less than the
+      // total capacity due to memory held by the driver and usage by other
+      // programs.
+      //
+      // Note that at this point free_cached_blocks has already returned all
+      // possible "cached" memory to the driver. The only remaining "cached"
+      // memory is split from a larger block that is partially in-use.
+      TORCH_CHECK_WITH(CUDAOutOfMemoryError, false,
+        "CUDA out of memory. Tried to allocate ", format_size(alloc_size),
+        " (GPU ", device, "; ",
+        format_size(device_total), " total capacity; ",
+        format_size(stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current),
+        " already allocated; ",
+        format_size(device_free), " free; ",
+        allowed_info,
+        format_size(stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current),
+        " reserved in total by PyTorch)");
     }
 
+    TORCH_INTERNAL_ASSERT(params.err == cudaSuccess &&
+                          params.block != nullptr &&
+                          params.block->ptr != nullptr);
     Block* block = params.block;
     Block* remaining = nullptr;
-    TORCH_INTERNAL_ASSERT(block);
 
     const bool already_split = block->is_split();
     if (should_split(block, size)) {
@@ -647,22 +648,36 @@ class DeviceCachingAllocator {
   }
 
   bool alloc_block(AllocParams& p, bool isRetry) {
+    // Defensively checks for preexisting CUDA error state.
+    C10_CUDA_CHECK(cudaGetLastError());
+
     size_t size = p.alloc_size;
     void* ptr;
 
     if (isRetry) {
       stats.num_alloc_retries += 1;
     }
+
     if (set_fraction && total_allocated_memory + size > allowed_memory_maximum) {
       p.err = cudaErrorMemoryAllocation;
+      return false;
     } else {
       p.err = cudaMalloc(&ptr, size);
-    }
-
-    if (p.err != cudaSuccess) {
-      if (!isRetry || p.err == cudaErrorMemoryAllocation)
-        cudaGetLastError(); // clear CUDA error
-      return false;
+      if (p.err != cudaSuccess) {
+        if (p.err == cudaErrorMemoryAllocation) {
+          // If this is the first attempt (!isRetry), we can forgive and clear CUDA's
+          // internal error state.
+          // If this is the second attempt (isRetry), malloc's TORCH_CHECK_WITH will take
+          // over to throw a helpful exception. The user can choose to catch the exception,
+          // free some stuff in their script, and attempt their allocation again.
+          // In this case, we can also forgive and clear CUDA's internal error state.
+          cudaGetLastError();
+        } else {
+          // If the error's unrelated to memory allocation, we should throw immediately.
+          C10_CUDA_CHECK(p.err);
+        }
+        return false;
+      }
     }
 
     total_allocated_memory += size;
@@ -670,7 +685,9 @@ class DeviceCachingAllocator {
     update_stat_array(stats.segment, 1, p.stat_types);
     update_stat_array(stats.reserved_bytes, size, p.stat_types);
 
-    return (p.block != nullptr);
+    // p.block came from new, not cudaMalloc. It should not be nullptr here.
+    TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr);
+    return true;
   }
 
   bool free_cached_blocks()
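
As an illustration of the policy the patch adopts (not part of the patch itself), here is a minimal self-contained sketch in plain CUDA C++. The helper name `try_raw_alloc` and the use of `std::runtime_error` are stand-ins for the allocator's own machinery (`alloc_block`, `C10_CUDA_CHECK`, `TORCH_CHECK_WITH`); only the CUDA runtime calls are real API.

```cpp
#include <cuda_runtime.h>
#include <cstddef>
#include <stdexcept>
#include <string>

// Hypothetical helper mirroring the new alloc_block policy:
//  * start from a clean slate by surfacing any preexisting (sticky) CUDA error,
//  * treat cudaErrorMemoryAllocation as a recoverable "no" (clear it, return false),
//  * treat every other cudaMalloc failure as fatal and throw immediately.
bool try_raw_alloc(void** ptr, std::size_t size) {
  // Defensive check: a leftover error from an unrelated earlier call should not
  // be attributed to (or silently cleared by) this allocation attempt.
  cudaError_t prior = cudaGetLastError();
  if (prior != cudaSuccess) {
    throw std::runtime_error(std::string("preexisting CUDA error: ") +
                             cudaGetErrorString(prior));
  }

  cudaError_t err = cudaMalloc(ptr, size);
  if (err == cudaSuccess) {
    return true;
  }
  if (err == cudaErrorMemoryAllocation) {
    // Out of memory is an expected, recoverable outcome for a caching
    // allocator: clear CUDA's sticky error state and let the caller decide
    // whether to free cached blocks and retry, or raise a friendly OOM.
    cudaGetLastError();
    return false;
  }
  // Anything else (invalid context, a prior async fault, ...) is not ours to
  // forgive; surface it right away instead of masking it.
  throw std::runtime_error(std::string("CUDA error in cudaMalloc: ") +
                           cudaGetErrorString(err));
}
```

The rule mirrored here is the one the diff spells out in `alloc_block`: `cudaGetLastError()` is used to clear the sticky error state only when the failure is an out-of-memory result of this function's own `cudaMalloc`; a preexisting error, or any non-OOM failure, is surfaced immediately rather than silently cleared.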