Skip to content

Commit

Permalink
Profiling allocator for mobile.
Browse files Browse the repository at this point in the history
Summary:
AllocationPlan: Stores the sequence of allocations, their sizes
                and lifetime of the allocations. Along with this
                it also stores the total size of a single memory
                blob, total_size, required to satisfy all the allocations.
                It also stores the offsets in the blob, of size
                total_size, corresponding to each allocation.
                Thus allocation plan contains:
                - allocation sizes
                - allocation lifetimes
                - allocation offsets
                - total size
AllocationPlanner: Takes a pointer to the allocation plan and fills
                  it up with the plan, i.e. sizes, lifetimes, offsets,
                  and total size.
                  This is done via WithProfileAllocationsGuard, which
                  takes in an AllocationPlan*, constructs an
                  AllocationPlanner*, and sets the thread-local
                  allocation_planner to it.
                  MobileCPUAllocator profiles allocations via
                  allocation_planner.
                  In WithValidateAllocationsGuard, allocations profiled
                  in the allocation plan are validated.
CPUProfilingAllocator:
The application owns the CPUProfilingAllocator.
Using WithProfilingAllocatorGuard, it passes both the CPUProfilingAllocator
and the AllocationPlan created earlier. The CPUProfilingAllocator will then
manage allocations and frees according to the plan. Allocations that
are not managed by CPUProfilingAllocator will be routed through
c10::alloc_cpu and c10::free_cpu.

Test Plan:
cpu_profiling_allocator_test on mobile.

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: 1c1b2d3d87130a57431a9455b4cb18a4935bdbd5
Pull Request resolved: #43951
  • Loading branch information
kimishpatel committed Oct 6, 2020
1 parent 45ddeb5 commit e85c4a4
Show file tree
Hide file tree
Showing 5 changed files with 744 additions and 1 deletion.
4 changes: 3 additions & 1 deletion aten/src/ATen/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,13 @@ list(APPEND ATen_VULKAN_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp)

list(APPEND ATen_MOBILE_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpu_profiling_allocator_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpu_caching_allocator_test.cpp)

list(APPEND ATen_VEC256_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp
${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp
)

# Caffe2 specific tests
Expand Down
167 changes: 167 additions & 0 deletions aten/src/ATen/test/cpu_profiling_allocator_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#include <gtest/gtest.h>

#include <c10/mobile/CPUProfilingAllocator.h>
#include <ATen/ATen.h>

// Runs a small conv2d -> flatten -> linear pipeline whose allocation pattern
// depends on `cond`, optionally recording or checking the data pointers of
// the tensors involved.
//
// input, conv_weight, linear_weight: operands of the pipeline.
// cond: when true, `input` is first replaced by `input * 2`, which adds one
//   more tensor to the run compared to the `cond == false` path.
// pointers: storage for recorded data pointers. Slot 0 holds the (possibly
//   scaled) input pointer, slot 1 holds the conv2d output pointer.
// record: when true, appends the two pointers of this run to `pointers`.
// validate: when true, checks the two pointers of this run against
//   `pointers[0]` and `pointers[1]` via TORCH_CHECK.
//
// Returns the output of the linear layer.
at::Tensor run_with_control_flow(
    at::Tensor input,
    at::Tensor conv_weight,
    at::Tensor linear_weight,
    bool cond,
    std::vector<void*>& pointers,
    bool record = false,
    bool validate = false) {
  if (cond) {
    input = input * 2;
  }
  void* input_ptr = input.data_ptr();
  auto conv_out = at::conv2d(input, conv_weight);
  // Take the pointer of the conv output itself. Reading input.data_ptr() here
  // (as before) made the second recorded pointer a duplicate of the first, so
  // the intermediate conv tensor was never actually compared.
  void* conv_out_ptr = conv_out.data_ptr();
  auto conv_out_flat = conv_out.view({conv_out.size(0), -1});
  auto output = at::linear(conv_out_flat, linear_weight);
  if (record) {
    pointers.push_back(input_ptr);
    pointers.push_back(conv_out_ptr);
  }
  if (validate) {
    // Validation indexes slots 0 and 1; refuse to read past the end when
    // nothing has been recorded yet.
    TORCH_CHECK(
        pointers.size() >= 2,
        "Pointer validation requested before any pointers were recorded.");
    TORCH_CHECK(input_ptr == pointers[0]);
    TORCH_CHECK(conv_out_ptr == pointers[1]);
  }
  return output;
}

// Checks that an allocation plan profiled along one control-flow path
// validates successfully only against runs that take the same path.
TEST(CPUAllocationPlanTest, with_control_flow) {
  at::Tensor a = at::rand({23, 16, 16, 16});
  at::Tensor conv_weight = at::rand({16, 16, 3, 3});
  // output shape
  // 23, 16, 14, 14
  // Flattened shape = 23, 3136
  at::Tensor linear_weight = at::rand({32, 3136});
  at::Tensor output;
  std::vector<void*> pointers;

  // Profiling a single run on its own must not throw.
  auto valid_allocation_plan = [&]() {
    c10::AllocationPlan plan;
    {
      c10::WithProfileAllocationsGuard profile_guard(&plan);
      output = run_with_control_flow(
          a, conv_weight, linear_weight, true, pointers);
    }
  };
  ASSERT_NO_THROW(valid_allocation_plan());

  // Profiles one run with `profile_cond` as the control-flow condition, then
  // validates ten runs that use `validate_cond`. Returns true only if every
  // validation run reported success.
  auto validate_allocation_plan =
      [&](bool profile_cond, bool validate_cond) -> bool {
    c10::AllocationPlan plan;
    {
      c10::WithProfileAllocationsGuard profile_guard(&plan);
      output = run_with_control_flow(
          a, conv_weight, linear_weight, profile_cond, pointers);
    }
    bool success{true};
    for (uint64_t i = 0; i < 10; ++i) {
      // Start from a known value so the read below is never of an
      // indeterminate bool; the guard receives the address and is presumably
      // what stores the real outcome - TODO confirm against the guard's
      // implementation.
      bool validation_success{false};
      {
        c10::WithValidateAllocationPlanGuard
            validation_guard(&plan, &validation_success);
        output = run_with_control_flow(
            a, conv_weight, linear_weight, validate_cond, pointers);
      }
      success = success && validation_success;
    }
    return success;
  };
  // Mismatched control flow between profiling and validation must fail.
  ASSERT_FALSE(validate_allocation_plan(false, true));
  ASSERT_FALSE(validate_allocation_plan(true, false));
  // Matching control flow must validate.
  ASSERT_TRUE(validate_allocation_plan(true, true));
  ASSERT_TRUE(validate_allocation_plan(false, false));
}

// Exercises CPUProfilingAllocator against a previously profiled plan: runs
// with matching control flow must succeed (and may be asked to hand back
// identical pointers), runs with different control flow must throw.
TEST(CPUAllocationPlanTest, with_profiling_alloc) {
  at::Tensor a = at::rand({23, 16, 16, 16});
  at::Tensor conv_weight = at::rand({16, 16, 3, 3});
  // conv2d output shape: 23, 16, 14, 14
  // Flattened shape:     23, 3136
  at::Tensor linear_weight = at::rand({32, 3136});
  at::Tensor output;
  std::vector<void*> pointers;

  // Sanity check: profiling one run by itself does not throw.
  auto valid_allocation_plan = [&]() {
    c10::AllocationPlan plan;
    c10::WithProfileAllocationsGuard profile_guard(&plan);
    output =
        run_with_control_flow(a, conv_weight, linear_weight, false, pointers);
  };
  ASSERT_NO_THROW(valid_allocation_plan());

  // profile_cond:   control-flow condition used while building the plan.
  // replay_cond:    control-flow condition used under the profiling allocator.
  // check_pointers: record pointers on the first allocator-backed run and
  //                 compare against them on the ten runs that follow.
  auto validate_allocation_plan =
      [&](bool profile_cond, bool replay_cond, bool check_pointers) {
        pointers.clear();

        c10::AllocationPlan plan;
        {
          c10::WithProfileAllocationsGuard profile_guard(&plan);
          output = run_with_control_flow(
              a, conv_weight, linear_weight, profile_cond, pointers);
        }

        c10::CPUProfilingAllocator profiling_allocator;
        // One run with the profiling allocator installed for its duration.
        auto run_under_allocator = [&](bool record, bool validate) {
          c10::WithProfilingAllocatorGuard profiling_allocator_guard(
              &profiling_allocator, &plan);
          output = run_with_control_flow(
              a,
              conv_weight,
              linear_weight,
              replay_cond,
              pointers,
              record,
              validate);
        };

        // First pass optionally records the pointers ...
        run_under_allocator(check_pointers, false);
        // ... and the following passes optionally validate against them.
        for (int pass = 0; pass < 10; ++pass) {
          run_under_allocator(false, check_pointers);
        }
      };

  // Same control flow while profiling and while running under the allocator:
  // the profiling allocator should not throw.
  ASSERT_NO_THROW(validate_allocation_plan(true, true, false));
  ASSERT_NO_THROW(validate_allocation_plan(false, false, false));
  // In addition, the profiling allocator should return the same pointers
  // for the intermediate tensors on every run.
  ASSERT_NO_THROW(validate_allocation_plan(true, true, true));
  ASSERT_NO_THROW(validate_allocation_plan(false, false, true));

  // Different control flow while profiling and while running under the
  // allocator: the profiling allocator should throw.
  ASSERT_THROW(validate_allocation_plan(true, false, false), c10::Error);
  ASSERT_THROW(validate_allocation_plan(false, true, false), c10::Error);
}

// Test entry point. The suite only runs when C10_MOBILE is defined; other
// builds exit successfully without running any test.
int main(int argc, char* argv[]) {
#if defined(C10_MOBILE)
  // At the moment the allocator under test is only exposed to the mobile CPU
  // allocator.
  ::testing::InitGoogleTest(&argc, argv);
  at::manual_seed(42);
  return RUN_ALL_TESTS();
#else
  return 0;
#endif // C10_MOBILE
}
15 changes: 15 additions & 0 deletions c10/core/CPUAllocator.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <c10/core/CPUAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/mobile/CPUCachingAllocator.h>
#include <c10/mobile/CPUProfilingAllocator.h>

// TODO: rename flags to C10
C10_DEFINE_bool(
Expand Down Expand Up @@ -156,13 +157,20 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
// TODO: enable with better TLS support on mobile
// profiledCPUMemoryReporter().Delete(pointer);
auto allocator_ptr = GetThreadLocalCachingAllocator();
auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
if (allocator_ptr != nullptr) {
allocator_ptr->free(pointer);
} else if (profiling_allocator_ptr != nullptr) {
profiling_allocator_ptr->free(pointer);
} else {
c10::free_cpu(pointer);
// This adds extra cost to freeing memory to the default case when
// caching allocator is not enabled.
CPUCachingAllocator::record_free(pointer);
auto allocation_planner = GetThreadLocalAllocationPlanner();
if (allocation_planner != nullptr) {
allocation_planner->record_free(pointer);
}
}
}

Expand All @@ -179,10 +187,17 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes;
void* data;
auto allocator_ptr = GetThreadLocalCachingAllocator();
auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
if (allocator_ptr != nullptr) {
data = allocator_ptr->allocate(alloc_size);
} else if (profiling_allocator_ptr != nullptr) {
data = profiling_allocator_ptr->allocate(alloc_size);
} else {
data = c10::alloc_cpu(alloc_size);
auto allocation_planner = GetThreadLocalAllocationPlanner();
if (allocation_planner != nullptr) {
allocation_planner->record_allocation(alloc_size, data);
}
}
// profiledCPUMemoryReporter().New(data, alloc_size);
return {
Expand Down

0 comments on commit e85c4a4

Please sign in to comment.