Profiling allocator for mobile. #43951

Closed · wants to merge 7 commits · Changes from 1 commit
1 change: 1 addition & 0 deletions aten/src/ATen/test/CMakeLists.txt
@@ -80,6 +80,7 @@ list(APPEND ATen_VULKAN_TEST_SRCS

list(APPEND ATen_MOBILE_TEST_SRCS
  ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cpu_profiling_allocator_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cpu_caching_allocator_test.cpp)

# ---[ Send the lists to the parent scope.
158 changes: 158 additions & 0 deletions aten/src/ATen/test/cpu_profiling_allocator_test.cpp
@@ -0,0 +1,158 @@
#include <gtest/gtest.h>

#include <c10/core/CPUProfilingAllocator.h>
#include <ATen/ATen.h>

at::Tensor run_with_control_flow(
    at::Tensor input,
    at::Tensor conv_weight,
    at::Tensor linear_weight,
    bool cond,
    std::vector<void*>& pointers,
    bool record = false,
    bool validate = false) {
  if (cond) {
    input = input * 2;
  }
  void* input_ptr = input.data_ptr();
  auto conv_out = at::conv2d(input, conv_weight);
  void* conv_out_ptr = conv_out.data_ptr();
  auto conv_out_flat = conv_out.view({conv_out.size(0), -1});
  auto output = at::linear(conv_out_flat, linear_weight);
  if (record) {
    pointers.push_back(input_ptr);
    pointers.push_back(conv_out_ptr);
  }
  if (validate) {
    TORCH_CHECK(input_ptr == pointers[0]);
    TORCH_CHECK(conv_out_ptr == pointers[1]);
  }
  return output;
}

TEST(CPUAllocationPlanTest, with_control_flow) {
  at::Tensor a = at::rand({23, 16, 16, 16});
  at::Tensor conv_weight = at::rand({16, 16, 3, 3});
  // Conv output shape: 23, 16, 14, 14
  // Flattened shape: 23, 3136
  at::Tensor linear_weight = at::rand({32, 3136});
  at::Tensor output;
  std::vector<void*> pointers;

  auto valid_allocation_plan = [&]() {
    c10::AllocationPlan plan;
    {
      c10::WithProfileAllocationsGuard profile_guard(&plan);
      output = run_with_control_flow(
          a, conv_weight, linear_weight, true, pointers);
    }
  };
  ASSERT_NO_THROW(valid_allocation_plan());

  auto validate_allocation_plan =
      [&](bool record_mode, bool validation_mode) -> bool {
    c10::AllocationPlan plan;
    {
      c10::WithProfileAllocationsGuard profile_guard(&plan);
      output = run_with_control_flow(
          a, conv_weight, linear_weight, record_mode, pointers);
    }
    bool success{true};
    for (uint64_t i = 0; i < 10; ++i) {
      bool validation_success;
      {
        c10::WithValidateAllocationPlanGuard
            validation_guard(&plan, &validation_success);
        output = run_with_control_flow(
            a, conv_weight, linear_weight, validation_mode, pointers);
      }
      success = success && validation_success;
    }
    return success;
  };
  ASSERT_FALSE(validate_allocation_plan(false, true));
  ASSERT_FALSE(validate_allocation_plan(true, false));
  ASSERT_TRUE(validate_allocation_plan(true, true));
  ASSERT_TRUE(validate_allocation_plan(false, false));
}

TEST(CPUAllocationPlanTest, with_profiling_alloc) {
  at::Tensor a = at::rand({23, 16, 16, 16});
  at::Tensor conv_weight = at::rand({16, 16, 3, 3});
  // Conv output shape: 23, 16, 14, 14
  // Flattened shape: 23, 3136
  at::Tensor linear_weight = at::rand({32, 3136});
  at::Tensor output;
  std::vector<void*> pointers;

  auto valid_allocation_plan = [&]() {
    c10::AllocationPlan plan;
    {
      c10::WithProfileAllocationsGuard profile_guard(&plan);
      output = run_with_control_flow(
          a, conv_weight, linear_weight, false, pointers);
    }
  };
  ASSERT_NO_THROW(valid_allocation_plan());

  auto validate_allocation_plan =
      [&](bool record_mode,
          bool validation_mode,
          bool validate_pointers) {
    pointers.clear();
    c10::AllocationPlan plan;
    {
      c10::WithProfileAllocationsGuard profile_guard(&plan);
      output = run_with_control_flow(
          a,
          conv_weight,
          linear_weight,
          record_mode,
          pointers,
          false,
          false);
    }
    c10::CPUProfilingAllocator profiling_allocator;
    {
      c10::WithProfilingAllocatorGuard
          profiling_allocator_guard(&profiling_allocator, &plan);
      output = run_with_control_flow(
          a,
          conv_weight,
          linear_weight,
          validation_mode,
          pointers,
          validate_pointers,
          false);
    }
    for (uint64_t i = 0; i < 10; ++i) {
      {
        c10::WithProfilingAllocatorGuard
            profiling_allocator_guard(&profiling_allocator, &plan);
        output = run_with_control_flow(
            a,
            conv_weight,
            linear_weight,
            validation_mode,
            pointers,
            false,
            validate_pointers);
      }
    }
  };
  // When control flow conditions are the same between profiling and
  // evaluation, the profiling allocator should not throw.
  ASSERT_NO_THROW(validate_allocation_plan(true, true, false));
  ASSERT_NO_THROW(validate_allocation_plan(false, false, false));
  // Furthermore, the profiling allocator should return the same pointers
  // for the intermediate tensors.
  ASSERT_NO_THROW(validate_allocation_plan(true, true, true));
  ASSERT_NO_THROW(validate_allocation_plan(false, false, true));

  // When control flow conditions differ between profiling and evaluation,
  // the profiling allocator should throw.
  ASSERT_THROW(validate_allocation_plan(true, false, false), c10::Error);
  ASSERT_THROW(validate_allocation_plan(false, true, false), c10::Error);
}
15 changes: 15 additions & 0 deletions c10/core/CPUAllocator.cpp
@@ -1,5 +1,6 @@
#include <c10/core/CPUAllocator.h>
#include <c10/core/CPUCachingAllocator.h>
#include <c10/core/CPUProfilingAllocator.h>
#include <c10/core/DeviceType.h>

// TODO: rename flags to C10
@@ -156,13 +157,20 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
    // TODO: enable with better TLS support on mobile
    // profiledCPUMemoryReporter().Delete(pointer);
    auto allocator_ptr = GetThreadLocalCachingAllocator();
    auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
Contributor:

Is there any particular reason this lives as a separate concept from the caching allocator? Given that there is never a situation where BOTH allocators are used, it seems like having two separate thread-local allocators is unnecessary?

Contributor:

By the way, the use of a thread-local allocator here seems super questionable. If pointers escape from the thread-local scope of the allocator, you will deallocate them with the wrong allocator here (which will probably cause a segfault). And we already have machinery for making sure you associate a correct deleter with each allocation (that's what DataPtr is for), so I'm not sure why you have to do it this way.

Contributor (author):

> Is there any particular reason this lives as a separate concept from the caching allocator? Given that there is never a situation where BOTH allocators are used, it seems like having two separate thread-local allocators is unnecessary?

Thanks Ed for the comment. The idea with the profiling allocator is that you create a plan offline and use that plan at runtime, throwing an exception when it fails. This PR introduces only the first part, where the allocation plan does not have to be serialized with the model; later we want to introduce those changes so that we do not have to run profiling and allocation planning on mobile. The caching allocator, on the other hand, is model independent, and it allows users to implement their own allocator by making the caching allocator a virtual class.
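A minimal sketch of the two-phase flow described above, assuming only the guard classes exercised by the tests in this diff; workload() is a hypothetical stand-in for a model forward pass:

#include <ATen/ATen.h>
#include <c10/core/CPUProfilingAllocator.h>

void profile_then_replay() {
  // Hypothetical workload standing in for a model forward pass.
  auto workload = []() {
    at::Tensor x = at::rand({8, 8});
    at::Tensor y = at::mm(x, x);
  };
  c10::AllocationPlan plan;
  {
    // Phase 1 (intended to run offline): record allocations to build a plan.
    c10::WithProfileAllocationsGuard profile_guard(&plan);
    workload();
  }
  c10::CPUProfilingAllocator profiling_allocator;
  for (int i = 0; i < 10; ++i) {
    // Phase 2 (runtime): serve allocations from the plan. Per the tests, a
    // run whose allocation sequence diverges from the profiled run throws
    // c10::Error.
    c10::WithProfilingAllocatorGuard guard(&profiling_allocator, &plan);
    workload();
  }
}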

Contributor (author):

> If pointers escape from the thread-local scope of the allocator

Can you explain what you mean by this?

If you mean that a pointer is allocated within the thread-local scope but deallocated outside it, the correct behavior is that such an allocation is not managed by the profiling allocator. If there is a bug that fails to ensure that behavior we need to fix it, but the design is that only allocations that are both allocated and freed within this thread-local scope are managed by the profiling allocator.

Contributor:

> This PR introduces only the first part, where the allocation plan does not have to be serialized with the model; later we want to introduce those changes so that we do not have to run profiling and allocation planning on mobile.

OK, I'll trust you on this. My general point was that if you only have one thread-local allocator, you might be able to manage this at a higher level, in the selection of which allocator you register to the thread-local slot at any given point. But maybe this is not so easy to do in the future.
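As an illustration of that alternative, here is a purely hypothetical sketch (none of these names exist in this PR): a single thread-local slot holds whichever special allocator is active, so DefaultMobileCPUAllocator would do one lookup instead of two.

#include <cstddef>

// Hypothetical common interface; in the PR the caching and profiling
// allocators are unrelated types looked up via separate thread locals.
struct MobileAllocatorInterface {
  virtual void* allocate(size_t nbytes) = 0;
  virtual void free(void* ptr) = 0;
  virtual ~MobileAllocatorInterface() = default;
};

static thread_local MobileAllocatorInterface* tls_special_allocator = nullptr;

// The choice between caching and profiling allocator is made where the
// guard is installed; the allocate/free paths stay allocator-agnostic.
struct WithSpecialAllocatorGuard {
  explicit WithSpecialAllocatorGuard(MobileAllocatorInterface* allocator)
      : prev_(tls_special_allocator) {
    tls_special_allocator = allocator;
  }
  ~WithSpecialAllocatorGuard() {
    tls_special_allocator = prev_;
  }
  MobileAllocatorInterface* prev_;
};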

    if (allocator_ptr != nullptr) {
      allocator_ptr->free(pointer);
    } else if (profiling_allocator_ptr != nullptr) {
      profiling_allocator_ptr->free(pointer);
    } else {
      c10::free_cpu(pointer);
      // This adds extra cost to freeing memory in the default case, when
      // the caching allocator is not enabled.
      CPUCachingAllocator::record_free(pointer);
      auto allocation_planner = GetThreadLocalAllocationPlanner();
      if (allocation_planner != nullptr) {
        allocation_planner->record_free(pointer);
      }
    }
  }

@@ -179,10 +187,17 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
    auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes;
    void* data;
    auto allocator_ptr = GetThreadLocalCachingAllocator();
    auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
    if (allocator_ptr != nullptr) {
      data = allocator_ptr->allocate(alloc_size);
    } else if (profiling_allocator_ptr != nullptr) {
      data = profiling_allocator_ptr->allocate(alloc_size);
Contributor:

Continuing from https://github.com/pytorch/pytorch/pull/43951/files#r492446863: the way to fix this problem is to change the API of the allocate indirection here to return a DataPtr instead of a void* pointer. Then the thread-local profiling allocator that was in scope at allocation time can provide an appropriate deleter (rather than having to do the lookup again upon deletion). You can then dispense with the deleter static method entirely.
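A sketch of what that suggestion might look like (hypothetical code, not part of this PR): the deleter context captures the allocator that served the allocation, so the free path needs no thread-local lookup.

#include <c10/core/Allocator.h>
#include <c10/core/CPUProfilingAllocator.h>

// Heap-allocated context pairing the allocation with its owning allocator.
struct ProfilingDeleterCtx {
  c10::CPUProfilingAllocator* allocator;
  void* data;
};

static void delete_profiling_ctx(void* raw_ctx) {
  auto* ctx = static_cast<ProfilingDeleterCtx*>(raw_ctx);
  // The allocator that allocated the memory frees it, even if the pointer
  // outlives the thread-local guard.
  ctx->allocator->free(ctx->data);
  delete ctx;
}

c10::DataPtr allocate_as_data_ptr(
    c10::CPUProfilingAllocator* profiling_allocator,
    size_t nbytes) {
  void* data = profiling_allocator->allocate(nbytes);
  auto* ctx = new ProfilingDeleterCtx{profiling_allocator, data};
  return c10::DataPtr(
      data, ctx, &delete_profiling_ctx, c10::Device(c10::DeviceType::CPU));
}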

Contributor (author):

As I said in my previous comments, the profiling allocator manages only those allocations which are allocated and freed within the scope. However, your point might still be valid, so I need to think about whether this is a better way to do the same thing. I will make changes accordingly.

Contributor:

Yes, I can believe that under your intended usage pattern, the failure case that I described would not happen. But if that's the precondition, you are at the very least obligated to document it (I don't think it is documented right now); and furthermore, the framework goes out of its way to make it easy for you to do things correctly even in the more general situation where allocations escape the scope. (In some sense, the latter point makes it more important to handle this correctly, as most people will assume your described precondition doesn't need to hold for all allocators!)

Contributor (author):

> Yes, I can believe that under your intended usage pattern, the failure case that I described would not happen.

To make sure we understand each other: what I mean is that if you use the profiling allocator and the utilities to generate an allocation plan, and then use that plan for allocation, it will not manage allocations whose lifetimes extend outside the scope of the allocator. I will add comments to that effect, so that users know that not all allocations made inside the scope are managed by the profiling allocator.

> In some sense, the latter point makes it more important to handle this correctly, as most people will assume your described precondition doesn't need to hold for all allocators.

I can agree with this. But when you say "handle this correctly", do you mean "handle it correctly even when the precondition (that allocations with lifetimes outside the scope are not managed by the allocator) does not hold"?

Contributor:

> What I mean is that if you use the profiling allocator and the utilities to generate an allocation plan, and then use that plan for allocation, it will not manage allocations whose lifetimes extend outside the scope of the allocator.

Yes. Though I thought I saw some comments implying that the profiling process can also detect allocations that escape the scope (you assign them an int-max ending, right?).

> But when you say "handle this correctly", do you mean "handle it correctly even when the precondition does not hold"?

Yes.

Contributor (author):

> Though I thought I saw some comments implying that the profiling process can also detect allocations that escape the scope (you assign them an int-max ending, right?).

That is correct.
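To make the "int-max ending" concrete, here is a hypothetical sketch (not code from this PR) of the lifetime bookkeeping being described; record_allocation and record_free mirror the planner hooks visible in the diff below, and an allocation never freed inside the scope keeps a max sentinel, marking it as escaped and therefore unmanaged.

#include <cstdint>
#include <limits>
#include <unordered_map>
#include <vector>

struct AllocationLifetime {
  uint64_t size;
  uint64_t start;  // allocation event index
  uint64_t end;    // free event index; max sentinel if never freed in scope
};

struct LifetimeRecorder {
  uint64_t event_counter = 0;
  std::vector<AllocationLifetime> lifetimes;
  std::unordered_map<void*, size_t> live;  // pointer -> index into lifetimes

  void record_allocation(uint64_t size, void* ptr) {
    live[ptr] = lifetimes.size();
    lifetimes.push_back(
        {size, event_counter++, std::numeric_limits<uint64_t>::max()});
  }

  void record_free(void* ptr) {
    auto it = live.find(ptr);
    if (it == live.end()) {
      return;  // not allocated in this scope; ignore
    }
    lifetimes[it->second].end = event_counter++;
    live.erase(it);
  }
  // Any lifetime still ending at the max sentinel when the plan is
  // formulated escaped the scope and is left to the default allocator.
};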

    } else {
      data = c10::alloc_cpu(alloc_size);
      auto allocation_planner = GetThreadLocalAllocationPlanner();
      if (allocation_planner != nullptr) {
        allocation_planner->record_allocation(alloc_size, data);
      }
    }
    // profiledCPUMemoryReporter().New(data, alloc_size);
    return {