Profiling allocator for mobile. #43951

Closed · wants to merge 7 commits · Changes from 1 commit
1 change: 1 addition & 0 deletions aten/src/ATen/test/CMakeLists.txt
@@ -80,6 +80,7 @@ list(APPEND ATen_VULKAN_TEST_SRCS

list(APPEND ATen_MOBILE_TEST_SRCS
  ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cpu_profiling_allocator_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cpu_caching_allocator_test.cpp)

# ---[ Send the lists to the parent scope.
158 changes: 158 additions & 0 deletions aten/src/ATen/test/cpu_profiling_allocator_test.cpp
@@ -0,0 +1,158 @@
#include <gtest/gtest.h>

#include <c10/core/CPUProfilingAllocator.h>
#include <ATen/ATen.h>

at::Tensor run_with_control_flow(
    at::Tensor input,
    at::Tensor conv_weight,
    at::Tensor linear_weight,
    bool cond,
    std::vector<void*>& pointers,
    bool record = false,
    bool validate = false) {
  if (cond) {
    input = input * 2;
  }
  void* input_ptr = input.data_ptr();
  auto conv_out = at::conv2d(input, conv_weight);
  void* conv_out_ptr = conv_out.data_ptr();
  auto conv_out_flat = conv_out.view({conv_out.size(0), -1});
  auto output = at::linear(conv_out_flat, linear_weight);
  if (record) {
    pointers.push_back(input_ptr);
    pointers.push_back(conv_out_ptr);
  }
  if (validate) {
    TORCH_CHECK(input_ptr == pointers[0]);
    TORCH_CHECK(conv_out_ptr == pointers[1]);
  }
  return output;
}

TEST(CPUAllocationPlanTest, with_control_flow) {
  at::Tensor a = at::rand({23, 16, 16, 16});
  at::Tensor conv_weight = at::rand({16, 16, 3, 3});
  // Conv output shape: 23, 16, 14, 14
  // Flattened shape: 23, 3136
  at::Tensor linear_weight = at::rand({32, 3136});
  at::Tensor output;
  std::vector<void*> pointers;

  auto valid_allocation_plan = [&]() {
    c10::AllocationPlan plan;
    {
      c10::WithProfileAllocationsGuard profile_guard(&plan);
      output = run_with_control_flow(
          a, conv_weight, linear_weight, true, pointers);
    }
  };
  ASSERT_NO_THROW(valid_allocation_plan());

  auto validate_allocation_plan =
      [&](bool record_mode, bool validation_mode) -> bool {
    c10::AllocationPlan plan;
    {
      c10::WithProfileAllocationsGuard profile_guard(&plan);
      output = run_with_control_flow(
          a, conv_weight, linear_weight, record_mode, pointers);
    }
    bool success{true};
    for (uint64_t i = 0; i < 10; ++i) {
      bool validation_success;
      {
        c10::WithValidateAllocationPlanGuard
            validation_guard(&plan, &validation_success);
        output = run_with_control_flow(
            a, conv_weight, linear_weight, validation_mode, pointers);
      }
      success = success && validation_success;
    }
    return success;
  };
  ASSERT_FALSE(validate_allocation_plan(false, true));
  ASSERT_FALSE(validate_allocation_plan(true, false));
  ASSERT_TRUE(validate_allocation_plan(true, true));
  ASSERT_TRUE(validate_allocation_plan(false, false));
}

TEST(CPUAllocationPlanTest, with_profiling_alloc) {
  at::Tensor a = at::rand({23, 16, 16, 16});
  at::Tensor conv_weight = at::rand({16, 16, 3, 3});
  // Conv output shape: 23, 16, 14, 14
  // Flattened shape: 23, 3136
  at::Tensor linear_weight = at::rand({32, 3136});
  at::Tensor output;
  std::vector<void*> pointers;

  auto valid_allocation_plan = [&]() {
    c10::AllocationPlan plan;
    {
      c10::WithProfileAllocationsGuard profile_guard(&plan);
      output = run_with_control_flow(
          a, conv_weight, linear_weight, false, pointers);
    }
  };
  ASSERT_NO_THROW(valid_allocation_plan());

  auto validate_allocation_plan =
      [&](bool record_mode,
          bool validation_mode,
          bool validate_pointers) {
    pointers.clear();
    c10::AllocationPlan plan;
    {
      c10::WithProfileAllocationsGuard profile_guard(&plan);
      output = run_with_control_flow(
          a,
          conv_weight,
          linear_weight,
          record_mode,
          pointers,
          false,
          false);
    }
    c10::CPUProfilingAllocator profiling_allocator;
    {
      c10::WithProfilingAllocatorGuard
          profiling_allocator_guard(&profiling_allocator, &plan);
      output = run_with_control_flow(
          a,
          conv_weight,
          linear_weight,
          validation_mode,
          pointers,
          validate_pointers,
          false);
    }
    for (uint64_t i = 0; i < 10; ++i) {
      {
        c10::WithProfilingAllocatorGuard
            profiling_allocator_guard(&profiling_allocator, &plan);
        output = run_with_control_flow(
            a,
            conv_weight,
            linear_weight,
            validation_mode,
            pointers,
            false,
            validate_pointers);
      }
    }
  };
  // When control flow conditions are the same between profiling and
  // evaluation, the profiling allocator should not throw.
  ASSERT_NO_THROW(validate_allocation_plan(true, true, false));
  ASSERT_NO_THROW(validate_allocation_plan(false, false, false));
  // Furthermore, the profiling allocator should return the same pointers
  // for the intermediate tensors.
  ASSERT_NO_THROW(validate_allocation_plan(true, true, true));
  ASSERT_NO_THROW(validate_allocation_plan(false, false, true));

  // When control flow conditions differ between profiling and evaluation,
  // the profiling allocator should throw.
  ASSERT_THROW(validate_allocation_plan(true, false, false), c10::Error);
  ASSERT_THROW(validate_allocation_plan(false, true, false), c10::Error);
}
15 changes: 15 additions & 0 deletions c10/core/CPUAllocator.cpp
@@ -1,5 +1,6 @@
#include <c10/core/CPUAllocator.h>
#include <c10/core/CPUCachingAllocator.h>
#include <c10/core/CPUProfilingAllocator.h>
#include <c10/core/DeviceType.h>

// TODO: rename flags to C10
@@ -156,13 +157,20 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
    // TODO: enable with better TLS support on mobile
    // profiledCPUMemoryReporter().Delete(pointer);
    auto allocator_ptr = GetThreadLocalCachingAllocator();
    auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
Contributor:

Is there any particular reason this lives as a separate concept from the caching allocator? Given that there is never a situation where BOTH allocators are used, it seems like having two separate thread-local allocators is unnecessary?

Contributor:

By the way, the use of a thread-local allocator here seems super questionable. If pointers escape from the thread-local scope of the allocator, you will deallocate them with the wrong allocator here (which will probably cause a segfault). And we already have machinery for making sure you associate a correct deleter with each allocation (that's what DataPtr is for), so I'm not sure why you have to do it this way.

Contributor (author):

> Is there any particular reason this lives as a separate concept from the caching allocator? Given that there is never a situation where BOTH allocators are used, it seems like having two separate thread-local allocators is unnecessary?

Thanks Ed for the comment. The idea with the profiling allocator is that you create a plan offline and use that plan at runtime, throwing an exception when it fails. This PR introduces only the first part, where the allocation plan does not have to be serialized with the model; later we want to introduce those changes so that we do not have to run profiling and allocation planning on mobile. The caching allocator, on the other hand, is model independent, and it allows users to implement their own allocator by making the caching allocator a virtual class.
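A minimal sketch of the two-phase flow described above, assuming only the guard classes exercised by the tests in this diff; workload() is a hypothetical stand-in for a model forward pass:

#include <ATen/ATen.h>
#include <c10/core/CPUProfilingAllocator.h>

void profile_then_replay() {
  // Hypothetical workload standing in for a model forward pass.
  auto workload = []() {
    at::Tensor x = at::rand({8, 8});
    at::Tensor y = at::mm(x, x);
  };
  c10::AllocationPlan plan;
  {
    // Phase 1 (intended to run offline): record allocations to build a plan.
    c10::WithProfileAllocationsGuard profile_guard(&plan);
    workload();
  }
  c10::CPUProfilingAllocator profiling_allocator;
  for (int i = 0; i < 10; ++i) {
    // Phase 2 (runtime): serve allocations from the plan. Per the tests, a
    // run whose allocation sequence diverges from the profiled run throws
    // c10::Error.
    c10::WithProfilingAllocatorGuard guard(&profiling_allocator, &plan);
    workload();
  }
}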

Contributor (author):

> If pointers escape from the thread-local scope of the allocator

Can you explain what you mean by this?

If you mean that a pointer is allocated within the thread-local scope but deallocated outside it, the correct behavior is that such an allocation is not managed by the profiling allocator. If there is a bug that fails to ensure that behavior we need to fix it, but the design is that only allocations that are both allocated and freed within this thread-local scope are managed by the profiling allocator.

Contributor:

> This PR introduces only the first part, where the allocation plan does not have to be serialized with the model; later we want to introduce those changes so that we do not have to run profiling and allocation planning on mobile.

OK, I'll trust you on this. My general point was that if you only have one thread-local allocator, you might be able to manage this at a higher level, in the selection of which allocator you register to the thread-local slot at any given point. But maybe this is not so easy to do in the future.
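As an illustration of that alternative, here is a purely hypothetical sketch (none of these names exist in this PR): a single thread-local slot holds whichever special allocator is active, so DefaultMobileCPUAllocator would do one lookup instead of two.

#include <cstddef>

// Hypothetical common interface; in the PR the caching and profiling
// allocators are unrelated types looked up via separate thread locals.
struct MobileAllocatorInterface {
  virtual void* allocate(size_t nbytes) = 0;
  virtual void free(void* ptr) = 0;
  virtual ~MobileAllocatorInterface() = default;
};

static thread_local MobileAllocatorInterface* tls_special_allocator = nullptr;

// The choice between caching and profiling allocator is made where the
// guard is installed; the allocate/free paths stay allocator-agnostic.
struct WithSpecialAllocatorGuard {
  explicit WithSpecialAllocatorGuard(MobileAllocatorInterface* allocator)
      : prev_(tls_special_allocator) {
    tls_special_allocator = allocator;
  }
  ~WithSpecialAllocatorGuard() {
    tls_special_allocator = prev_;
  }
  MobileAllocatorInterface* prev_;
};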

    if (allocator_ptr != nullptr) {
      allocator_ptr->free(pointer);
    } else if (profiling_allocator_ptr != nullptr) {
      profiling_allocator_ptr->free(pointer);
    } else {
      c10::free_cpu(pointer);
      // This adds extra cost to freeing memory in the default case, when
      // the caching allocator is not enabled.
      CPUCachingAllocator::record_free(pointer);
      auto allocation_planner = GetThreadLocalAllocationPlanner();
      if (allocation_planner != nullptr) {
        allocation_planner->record_free(pointer);
      }
    }
  }

@@ -179,10 +187,17 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
    auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes;
    void* data;
    auto allocator_ptr = GetThreadLocalCachingAllocator();
    auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
    if (allocator_ptr != nullptr) {
      data = allocator_ptr->allocate(alloc_size);
    } else if (profiling_allocator_ptr != nullptr) {
      data = profiling_allocator_ptr->allocate(alloc_size);
Contributor:

Continuing from https://github.com/pytorch/pytorch/pull/43951/files#r492446863: the way to fix this problem is to change the API of the allocate indirection here to return a DataPtr instead of a void* pointer. Then the thread-local profiling allocator that was in scope at allocation time can provide an appropriate deleter (rather than having to do the lookup again upon deletion). You can then dispense with the deleter static method entirely.
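A sketch of what that suggestion might look like (hypothetical code, not part of this PR): the deleter context captures the allocator that served the allocation, so the free path needs no thread-local lookup.

#include <c10/core/Allocator.h>
#include <c10/core/CPUProfilingAllocator.h>

// Heap-allocated context pairing the allocation with its owning allocator.
struct ProfilingDeleterCtx {
  c10::CPUProfilingAllocator* allocator;
  void* data;
};

static void delete_profiling_ctx(void* raw_ctx) {
  auto* ctx = static_cast<ProfilingDeleterCtx*>(raw_ctx);
  // The allocator that allocated the memory frees it, even if the pointer
  // outlives the thread-local guard.
  ctx->allocator->free(ctx->data);
  delete ctx;
}

c10::DataPtr allocate_as_data_ptr(
    c10::CPUProfilingAllocator* profiling_allocator,
    size_t nbytes) {
  void* data = profiling_allocator->allocate(nbytes);
  auto* ctx = new ProfilingDeleterCtx{profiling_allocator, data};
  return c10::DataPtr(
      data, ctx, &delete_profiling_ctx, c10::Device(c10::DeviceType::CPU));
}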

Contributor (author):

As I said in my previous comments, the profiling allocator manages only those allocations which are allocated and freed within the scope. However, your point might still be valid, so I need to think about whether this is a better way to do the same thing. I will make changes accordingly.

Contributor:

Yes, I can believe that under your intended usage pattern, the failure case that I described would not happen. But if that's the precondition, you are at the very least obligated to document it (I don't think it is documented right now); and furthermore, the framework goes out of its way to make it easy for you to do things correctly even in the more general situation where allocations escape the scope. (In some sense, the latter point makes it more important to handle this correctly, as most people will assume your described precondition doesn't need to hold for all allocators!)

Contributor (author):

> Yes, I can believe that under your intended usage pattern, the failure case that I described would not happen.

To make sure we understand each other: what I mean is that if you use the profiling allocator and the utilities to generate an allocation plan, and then use that plan for allocation, it will not manage allocations whose lifetimes extend outside the scope of the allocator. I will add comments to that effect, so that users know that not all allocations made inside the scope are managed by the profiling allocator.

> In some sense, the latter point makes it more important to handle this correctly, as most people will assume your described precondition doesn't need to hold for all allocators.

I can agree with this. But when you say "handle this correctly", do you mean "handle it correctly even when the precondition (that allocations with lifetimes outside the scope are not managed by the allocator) does not hold"?

Contributor:

> What I mean is that if you use the profiling allocator and the utilities to generate an allocation plan, and then use that plan for allocation, it will not manage allocations whose lifetimes extend outside the scope of the allocator.

Yes. Though I thought I saw some comments implying that the profiling process can also detect allocations that escape the scope (you assign them an int-max ending, right?).

> But when you say "handle this correctly", do you mean "handle it correctly even when the precondition does not hold"?

Yes.

Contributor (author):

> Though I thought I saw some comments implying that the profiling process can also detect allocations that escape the scope (you assign them an int-max ending, right?).

That is correct.
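To make the "int-max ending" concrete, here is a hypothetical sketch (not code from this PR) of the lifetime bookkeeping being described; record_allocation and record_free mirror the planner hooks visible in the diff below, and an allocation never freed inside the scope keeps a max sentinel, marking it as escaped and therefore unmanaged.

#include <cstdint>
#include <limits>
#include <unordered_map>
#include <vector>

struct AllocationLifetime {
  uint64_t size;
  uint64_t start;  // allocation event index
  uint64_t end;    // free event index; max sentinel if never freed in scope
};

struct LifetimeRecorder {
  uint64_t event_counter = 0;
  std::vector<AllocationLifetime> lifetimes;
  std::unordered_map<void*, size_t> live;  // pointer -> index into lifetimes

  void record_allocation(uint64_t size, void* ptr) {
    live[ptr] = lifetimes.size();
    lifetimes.push_back(
        {size, event_counter++, std::numeric_limits<uint64_t>::max()});
  }

  void record_free(void* ptr) {
    auto it = live.find(ptr);
    if (it == live.end()) {
      return;  // not allocated in this scope; ignore
    }
    lifetimes[it->second].end = event_counter++;
    live.erase(it);
  }
  // Any lifetime still ending at the max sentinel when the plan is
  // formulated escaped the scope and is left to the default allocator.
};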

    } else {
      data = c10::alloc_cpu(alloc_size);
      auto allocation_planner = GetThreadLocalAllocationPlanner();
      if (allocation_planner != nullptr) {
        allocation_planner->record_allocation(alloc_size, data);
      }
    }
    // profiledCPUMemoryReporter().New(data, alloc_size);
    return {