diff --git a/extension/memory_allocator/CMakeLists.txt b/extension/memory_allocator/CMakeLists.txt
new file mode 100644
index 00000000000..1c3c8a0831c
--- /dev/null
+++ b/extension/memory_allocator/CMakeLists.txt
@@ -0,0 +1,48 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Please keep this file formatted by running:
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+list(TRANSFORM _extension_memory_allocator__srcs PREPEND "${EXECUTORCH_ROOT}/")
+if(CMAKE_TOOLCHAIN_IOS
+   OR CMAKE_TOOLCHAIN_ANDROID
+   OR APPLE
+)
+  # Building a shared library on iOS requires code signing. On Android we see
+  # duplicated registration when using a shared lib, so build statically.
+  add_library(extension_memory_allocator STATIC ${_extension_memory_allocator__srcs})
+else()
+  add_library(extension_memory_allocator ${_extension_memory_allocator__srcs})
+endif()
+target_link_libraries(
+  extension_memory_allocator PRIVATE executorch_core)
+target_include_directories(
+  extension_memory_allocator PUBLIC ${_common_include_directories}
+)
+target_compile_options(
+  extension_memory_allocator
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
+)
+
+# Install libraries
+install(
+  TARGETS extension_memory_allocator
+  EXPORT ExecuTorchTargets
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  INCLUDES
+  DESTINATION ${_common_include_directories}
+)
diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.cpp b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp
new file mode 100644
index 00000000000..eac308a944e
--- /dev/null
+++ b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp
@@ -0,0 +1,89 @@
+#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
+
+#include <algorithm> // std::max
+#include <cstdlib> // std::aligned_alloc, std::free
+namespace executorch::extension {
+
+namespace {
+size_t get_alignment_adjusted_size(size_t size, size_t alignment) {
+  alignment = std::max(alignment, kDefaultAlignment);
+  if (size % alignment != 0) {
+    // Round size up to the next multiple of alignment; std::aligned_alloc
+    // requires size to be an integral multiple of alignment.
+    return (size + alignment) & ~(alignment - 1);
+  } else {
+    return size;
+  }
+}
+} // namespace
+
+CPUCachingAllocator::CPUCachingAllocator(uint32_t max_size)
+    : MemoryAllocator(0, nullptr),
+      max_size_(max_size),
+      current_size_(0) {}
+
+void* CPUCachingAllocator::allocate(size_t size, size_t alignment) {
+  EXECUTORCH_TRACK_ALLOCATION(prof_id(), size);
+
+  if (!isPowerOf2(alignment)) {
+    ET_LOG(Error, "Alignment %zu is not a power of 2", alignment);
+    return nullptr;
+  }
+  size = get_alignment_adjusted_size(size, alignment);
+
+  std::lock_guard<std::mutex> guard(mutex_);
+  auto it = available_map_.find(size);
+  if (it == available_map_.end() || it->second.empty()) {
+    if (current_size_ + size > max_size_) {
+      // Freeing while holding the lock will cause performance issues.
+      // We probably should log how often this happens so as to allow
+      // the calling site to adjust the max_size_ parameter.
+      free_cached();
+    }
+    void* ptr = std::aligned_alloc(alignment, size);
+    if (ptr == nullptr) {
+      ET_LOG(Error, "Failed to allocate memory");
+      return nullptr;
+    }
+    // Count the bytes only after the allocation actually succeeded.
+    current_size_ += size;
+    allocation_map_[ptr] = size;
+    return ptr;
+  }
+  void* ptr = it->second.back();
+  it->second.pop_back();
+  current_size_ += size; // reset() subtracted this when caching the block.
+  allocation_map_[ptr] = size;
+  return ptr;
+}
+
+void CPUCachingAllocator::free_cached() {
+  // We don't lock mutex_ here because that would deadlock: the callers
+  // (allocate() and the destructor) already guarantee exclusive access.
+  // free_cached() is not a public API anyways.
+  for (const auto& it : available_map_) {
+    for (const auto ptr : it.second) {
+      std::free(ptr);
+    }
+  }
+  available_map_.clear();
+}
+
+void CPUCachingAllocator::reset() {
+  std::lock_guard<std::mutex> guard(mutex_);
+  for (auto& it : allocation_map_) {
+    void* ptr = it.first;
+    size_t alloc_size = it.second;
+    // Cache the memory; it no longer counts as handed out.
+    available_map_[alloc_size].push_back(ptr);
+    current_size_ -= alloc_size;
+  }
+  allocation_map_.clear();
+}
+
+CPUCachingAllocator::~CPUCachingAllocator() {
+  // Must be called in a thread-safe manner; releases everything still held.
+  reset();
+  free_cached();
+}
+
+} // namespace executorch::extension
diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.h b/extension/memory_allocator/cpu_caching_malloc_allocator.h
new file mode 100644
index 00000000000..d22f9e38396
--- /dev/null
+++ b/extension/memory_allocator/cpu_caching_malloc_allocator.h
@@ -0,0 +1,81 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <mutex>
+
+#include <executorch/runtime/core/memory_allocator.h>
+#ifdef USE_C10_SMALL_VECTOR
+#include <c10/util/SmallVector.h>
+#else
+#include <vector>
+#endif
+
+#ifdef USE_C10_FLAT_HASH_MAP
+#include <c10/util/flat_hash_map.h>
+#else
+#include <unordered_map>
+#endif
+
+/*
+ * CPUCachingAllocator:
+ * This file is adapted from c10/mobile/CPUCachingAllocator.h.
+ * It is a thread-safe caching allocator: freed blocks are cached for reuse.
+ */
+
+namespace executorch::extension {
+
+#ifdef USE_C10_SMALL_VECTOR
+template <typename T>
+using SmallVector = c10::SmallVector<T, 16>;
+#else
+template <typename T>
+using SmallVector = std::vector<T>;
+#endif
+
+#ifdef USE_C10_FLAT_HASH_MAP
+template <typename K, typename V>
+using FlatHashMap = ska::flat_hash_map<K, V>;
+#else
+template <typename K, typename V>
+using FlatHashMap = std::unordered_map<K, V>;
+#endif
+
+constexpr size_t kDefaultAlignment = 64;
+class CPUCachingAllocator : public executorch::runtime::MemoryAllocator {
+  /*
+   * What it does:
+   * Caches all the allocations carried out by this allocator.
+   * Cache key is the (alignment-adjusted) size of the allocation.
+   * If a requested size is found in the cache, the cached pointer is reused.
+   * What it does not do:
+   * No speculative allocation for any future allocations.
+   */
+ private:
+  void free_cached(); // Frees all cached blocks; callers hold exclusive access.
+
+ protected:
+  // Invariants: allocation_map_ tracks blocks currently handed out to
+  // callers; available_map_ holds cached reusable blocks keyed by size.
+  FlatHashMap<size_t, SmallVector<void*>> available_map_;
+  FlatHashMap<void*, size_t> allocation_map_;
+  // Both maps above are mutated/read via every public API, so all
+  // accesses are serialized with this mutex.
+  std::mutex mutex_;
+  size_t max_size_;
+  size_t current_size_;
+
+ public:
+  /*
+   * max_size: Maximum size of memory to cache. Never cache more than that.
+   */
+  CPUCachingAllocator(uint32_t max_size);
+  // Checks the cache to see if an allocation of `size` bytes can be found.
+  // If so, returns the cached memory; else allocates fresh memory,
+  // records it for caching and returns it.
+  void* allocate(size_t size, size_t alignment = kDefaultAlignment) override;
+  void reset() override;
+  ~CPUCachingAllocator() override;
+};
+
+} // namespace executorch::extension
diff --git a/extension/memory_allocator/targets.bzl b/extension/memory_allocator/targets.bzl
index d021a4da707..f51b084e4ee 100644
--- a/extension/memory_allocator/targets.bzl
+++ b/extension/memory_allocator/targets.bzl
@@ -20,3 +20,20 @@ def define_common_targets():
             "@EXECUTORCH_CLIENTS",
         ],
     )
+
+    runtime.cxx_library(
+        name = "cpu_caching_allocator",
+        srcs = [
+            "cpu_caching_malloc_allocator.cpp",
+        ],
+        exported_headers = [
+            "cpu_caching_malloc_allocator.h",
+        ],
+        exported_deps = [
+            "//executorch/runtime/core:memory_allocator",
+        ],
+        visibility = [
+            "//executorch/extension/memory_allocator/test/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
diff --git a/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp
new file mode 100644
index 00000000000..cda60c1b6f3
--- /dev/null
+++ b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp
@@ -0,0 +1,301 @@
+#include <cstdint>
+#include <cstring>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
+
+#include <executorch/runtime/platform/runtime.h>
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+using executorch::extension::CPUCachingAllocator;
+
+constexpr auto kDefaultAlignment = executorch::extension::kDefaultAlignment;
+
+class CPUCachingAllocatorTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Since these tests cause ET_LOG to be called, the PAL must be initialized
+    // first.
+    executorch::runtime::runtime_init();
+  }
+};
+
+bool is_aligned(const void* ptr, size_t alignment) {
+  uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
+  return addr % alignment == 0;
+}
+
+#define EXPECT_ALIGNED(ptr, alignment)        \
+  EXPECT_TRUE(is_aligned((ptr), (alignment))) \
+      << "Pointer " << (ptr) << " is not aligned to " << (alignment)
+
+TEST_F(CPUCachingAllocatorTest, SimpleAllocateSucceeds) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  auto p = allocator.allocate(16);
+  EXPECT_NE(p, nullptr);
+  EXPECT_ALIGNED(p, kDefaultAlignment);
+
+  auto p2 = allocator.allocate(32);
+  EXPECT_NE(p2, nullptr);
+  EXPECT_ALIGNED(p2, kDefaultAlignment);
+
+  auto p3 = allocator.allocate(64);
+  EXPECT_NE(p3, nullptr);
+  EXPECT_ALIGNED(p3, kDefaultAlignment);
+}
+
+TEST_F(CPUCachingAllocatorTest, CachingReusesSameSize) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  auto p1 = allocator.allocate(256);
+  EXPECT_NE(p1, nullptr);
+  EXPECT_ALIGNED(p1, kDefaultAlignment);
+
+  // Reset to return the allocation to the cache
+  allocator.reset();
+
+  // Allocate the same size should reuse the cached pointer
+  auto p2 = allocator.allocate(256);
+  EXPECT_EQ(p1, p2);
+  EXPECT_ALIGNED(p2, kDefaultAlignment);
+}
+
+TEST_F(CPUCachingAllocatorTest, DifferentSizesAllocateDifferentPtrs) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  auto p1 = allocator.allocate(128);
+  auto p2 = allocator.allocate(256);
+  auto p3 = allocator.allocate(512);
+
+  EXPECT_NE(p1, nullptr);
+  EXPECT_NE(p2, nullptr);
+  EXPECT_NE(p3, nullptr);
+
+  // All pointers should be different
+  EXPECT_NE(p1, p2);
+  EXPECT_NE(p2, p3);
+  EXPECT_NE(p1, p3);
+
+  EXPECT_ALIGNED(p1, kDefaultAlignment);
+  EXPECT_ALIGNED(p2, kDefaultAlignment);
+  EXPECT_ALIGNED(p3, kDefaultAlignment);
+}
+
+TEST_F(CPUCachingAllocatorTest, ResetCachesAllocations) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  auto p1 = allocator.allocate(256);
+  auto p2 = allocator.allocate(256);
+  EXPECT_NE(p1, p2);
+
+  allocator.reset();
+
+  // After reset, both cached allocations should be available
+  auto p3 = allocator.allocate(256);
+  auto p4 = allocator.allocate(256);
+
+  // p3 should be one of the cached pointers (either p1 or p2)
+  EXPECT_TRUE((p3 == p1) || (p3 == p2));
+  EXPECT_TRUE((p4 == p1) || (p4 == p2));
+  EXPECT_NE(p3, p4);
+}
+
+TEST_F(CPUCachingAllocatorTest, AlignmentParameter) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  std::vector<size_t> alignments = {
+      kDefaultAlignment,
+      kDefaultAlignment * 2,
+      kDefaultAlignment * 4,
+      kDefaultAlignment * 8,
+  };
+
+  for (size_t alignment : alignments) {
+    auto p = allocator.allocate(256, alignment);
+    EXPECT_NE(p, nullptr);
+    EXPECT_ALIGNED(p, alignment);
+  }
+}
+
+TEST_F(CPUCachingAllocatorTest, InvalidAlignmentFails) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  // Should fail because alignment is not a power of 2
+  std::vector<size_t> invalid_alignments = {0, 5, 6, 12, 34};
+  for (auto alignment : invalid_alignments) {
+    auto p = allocator.allocate(256, alignment);
+    EXPECT_EQ(p, nullptr);
+  }
+}
+
+TEST_F(CPUCachingAllocatorTest, MaxSizeRespected) {
+  constexpr size_t kMaxSize = 1024; // 1KB max
+  CPUCachingAllocator allocator(kMaxSize);
+
+  // Allocate close to the max size
+  auto p1 = allocator.allocate(512);
+  EXPECT_NE(p1, nullptr);
+
+  auto p2 = allocator.allocate(512);
+  EXPECT_NE(p2, nullptr);
+
+  // This should trigger cache freeing since we would exceed max_size
+  auto p3 = allocator.allocate(512);
+  EXPECT_NE(p3, nullptr);
+}
+
+TEST_F(CPUCachingAllocatorTest, MultipleAllocationsAndResets) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  for (int i = 0; i < 5; ++i) {
+    auto p1 = allocator.allocate(256);
+    auto p2 = allocator.allocate(512);
+    auto p3 = allocator.allocate(1024);
+
+    EXPECT_NE(p1, nullptr);
+    EXPECT_NE(p2, nullptr);
+    EXPECT_NE(p3, nullptr);
+
+    allocator.reset();
+  }
+}
+
+TEST_F(CPUCachingAllocatorTest, MemoryWriteability) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  const size_t size = 1024;
+  auto p = allocator.allocate(size);
+  EXPECT_NE(p, nullptr);
+
+  // Write to allocated memory
+  memset(p, 0x55, size);
+
+  // Read back and verify
+  uint8_t* bytes = reinterpret_cast<uint8_t*>(p);
+  for (size_t i = 0; i < size; ++i) {
+    EXPECT_EQ(bytes[i], 0x55);
+  }
+
+  allocator.reset();
+}
+
+TEST_F(CPUCachingAllocatorTest, CachingWithMultipleSizes) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  // Allocate various sizes
+  auto p1 = allocator.allocate(128);
+  auto p2 = allocator.allocate(256);
+  auto p3 = allocator.allocate(512);
+  auto p4 = allocator.allocate(128);
+
+  // Reset to cache them
+  allocator.reset();
+
+  // Allocate same sizes - should reuse cached pointers
+  auto p5 = allocator.allocate(128);
+  auto p6 = allocator.allocate(256);
+  auto p7 = allocator.allocate(512);
+
+  EXPECT_TRUE((p5 == p1) || (p5 == p4));
+  EXPECT_EQ(p6, p2);
+  EXPECT_EQ(p7, p3);
+}
+
+TEST_F(CPUCachingAllocatorTest, ThreadSafety) {
+  CPUCachingAllocator allocator(4 * 1024 * 1024); // 4MB max size
+
+  std::vector<std::thread> threads;
+  std::vector<void*> allocated_ptrs;
+  std::mutex ptrs_mutex;
+
+  const int num_threads = 4;
+  const int allocations_per_thread = 10;
+
+  // Lambda function for thread work
+  auto thread_work = [&]() {
+    for (int i = 0; i < allocations_per_thread; ++i) {
+      size_t size = (i + 1) * 64;
+      auto p = allocator.allocate(size);
+      EXPECT_NE(p, nullptr);
+      EXPECT_ALIGNED(p, kDefaultAlignment);
+
+      {
+        std::lock_guard<std::mutex> guard(ptrs_mutex);
+        allocated_ptrs.push_back(p);
+      }
+    }
+
+    // Reset in each thread
+    allocator.reset();
+  };
+
+  // Create threads
+  for (int i = 0; i < num_threads; ++i) {
+    threads.emplace_back(thread_work);
+  }
+
+  // Wait for all threads to finish
+  for (auto& thread : threads) {
+    thread.join();
+  }
+
+  // Verify all allocations were valid
+  EXPECT_EQ(allocated_ptrs.size(), static_cast<size_t>(num_threads * allocations_per_thread));
+}
+
+TEST_F(CPUCachingAllocatorTest, LargeAllocation) {
+  CPUCachingAllocator allocator(10 * 1024 * 1024); // 10MB max size
+
+  const size_t large_size = 1024 * 1024; // 1MB allocation
+  auto p = allocator.allocate(large_size);
+  EXPECT_NE(p, nullptr);
+  EXPECT_ALIGNED(p, kDefaultAlignment);
+
+  // Write and verify
+  memset(p, 0xAA, large_size);
+  uint8_t* bytes = reinterpret_cast<uint8_t*>(p);
+  for (size_t i = 0; i < 1000; ++i) { // Sample check
+    EXPECT_EQ(bytes[i], 0xAA);
+  }
+
+  allocator.reset();
+
+  // Re-allocate same size should reuse cached pointer
+  auto p2 = allocator.allocate(large_size);
+  EXPECT_EQ(p, p2);
+}
+
+TEST_F(CPUCachingAllocatorTest, SizeAlignmentAdjustment) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  // Test that allocation sizes get properly aligned
+  auto p1 = allocator.allocate(100, 256); // Size not aligned to 256
+  EXPECT_NE(p1, nullptr);
+  EXPECT_ALIGNED(p1, 256);
+
+  auto p2 = allocator.allocate(100, 256);
+  // The cache is empty here, so p2 must be a fresh, distinct allocation.
+  EXPECT_NE(p1, p2);
+  allocator.reset();
+
+  auto p3 = allocator.allocate(100, 256);
+  // Both requests round to the same size bucket; the cache serves one back.
+  EXPECT_TRUE(p3 == p1 || p3 == p2);
+}
+
+TEST_F(CPUCachingAllocatorTest, ResetMultipleTimes) {
+  CPUCachingAllocator allocator(1024 * 1024); // 1MB max size
+
+  for (int i = 0; i < 3; ++i) {
+    auto p = allocator.allocate(512);
+    EXPECT_NE(p, nullptr);
+    allocator.reset();
+
+    auto p2 = allocator.allocate(512);
+    EXPECT_EQ(p, p2);
+  }
+}
diff --git a/extension/memory_allocator/test/targets.bzl b/extension/memory_allocator/test/targets.bzl
index 77fb6936a3a..5855bee5c14 100644
--- a/extension/memory_allocator/test/targets.bzl
+++ b/extension/memory_allocator/test/targets.bzl
@@ -15,3 +15,13 @@ def define_common_targets():
             "//executorch/extension/memory_allocator:malloc_memory_allocator",
         ],
     )
+
+    runtime.cxx_test(
+        name = "cpu_caching_malloc_allocator_test",
+        srcs = [
+            "cpu_caching_malloc_allocator_test.cpp",
+        ],
+        deps = [
+            "//executorch/extension/memory_allocator:cpu_caching_allocator",
+        ],
+    )
diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl
index 8d8893f7454..0529c814f14 100644
--- a/shim_et/xplat/executorch/build/build_variables.bzl
+++ b/shim_et/xplat/executorch/build/build_variables.bzl
@@ -337,6 +337,10 @@ EXTENSION_FLAT_TENSOR_SRCS = [
     "extension/flat_tensor/serialize/flat_tensor_header.cpp",
 ]
 
+EXTENSION_MEMORY_ALLOCATOR_SRCS = [
+    "extension/memory_allocator/cpu_caching_malloc_allocator.cpp",
+]
+
 EXTENSION_MODULE_SRCS = [
     "extension/module/module.cpp",
 ]
diff --git a/tools/cmake/Codegen.cmake b/tools/cmake/Codegen.cmake
index 32d3d8b554f..e838e62c582 100644
--- a/tools/cmake/Codegen.cmake
+++ b/tools/cmake/Codegen.cmake
@@ -398,6 +398,7 @@ function(executorch_load_build_variables)
     EXTENSION_DATA_LOADER_SRCS
     EXTENSION_EVALUE_UTIL_SRCS
     EXTENSION_FLAT_TENSOR_SRCS
+    EXTENSION_MEMORY_ALLOCATOR_SRCS
     EXTENSION_MODULE_SRCS
     EXTENSION_NAMED_DATA_MAP_SRCS
     EXTENSION_RUNNER_UTIL_SRCS
@@ -431,6 +432,7 @@ function(executorch_load_build_variables)
     _extension_data_loader__srcs
     _extension_evalue_util__srcs
    _extension_flat_tensor__srcs
+    _extension_memory_allocator__srcs
     _extension_module__srcs
     _extension_named_data_map__srcs
     _extension_runner_util__srcs