diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl
index b68043f7feb..a54c47e979d 100644
--- a/backends/cuda/runtime/shims/tests/targets.bzl
+++ b/backends/cuda/runtime/shims/tests/targets.bzl
@@ -42,3 +42,27 @@ def define_common_targets():
     cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle")
     cuda_shim_cpp_unittest("aoti_torch_item_bool")
     cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out")
+
+    cpp_unittest(
+        name = "test_op__device_copy",
+        srcs = ["test_op__device_copy.cpp"],
+        deps = [
+            "//executorch/backends/cuda/runtime:cuda_backend",
+            "//executorch/kernels/portable:generated_lib",
+            "//executorch/kernels/portable:generated_lib_headers",
+            "//executorch/kernels/portable/cpu:op__device_copy",
+            "//executorch/runtime/core:device_allocator",
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/portable_type:portable_type",
+            "//executorch/runtime/kernel:kernel_runtime_context",
+            "//executorch/runtime/platform:platform",
+        ],
+        external_deps = [
+            ("cuda", None, "cuda-lazy"),
+        ],
+        preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
+        keep_gpu_sections = True,
+        remote_execution = re_test_utils.remote_execution(
+            platform = "gpu-remote-execution",
+        ),
+    )
diff --git a/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp
new file mode 100644
index 00000000000..4e5c5a099b7
--- /dev/null
+++ b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda_runtime.h>
+#include <executorch/kernels/portable/Functions.h>
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/portable_type/tensor_impl.h>
+#include <executorch/runtime/kernel/kernel_runtime_context.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <gtest/gtest.h>
+
+#if (defined(__has_feature) && __has_feature(address_sanitizer)) || \
+    defined(__SANITIZE_ADDRESS__)
+#include <sanitizer/lsan_interface.h>
+#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 1
+#else
+#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 0
+#endif
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::aten::TensorImpl;
+using executorch::runtime::Error;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::KernelRuntimeContext;
+using executorch::runtime::TensorShapeDynamism;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+namespace {
+
+struct CudaDeleter {
+  void operator()(void* ptr) const {
+    if (ptr != nullptr) {
+      cudaFree(ptr);
+    }
+  }
+};
+
+using CudaPtr = std::unique_ptr<void, CudaDeleter>;
+
+CudaPtr allocate_cuda(size_t nbytes) {
+  void* ptr = nullptr;
+  const cudaError_t err = cudaMalloc(&ptr, nbytes);
+  EXPECT_EQ(err, cudaSuccess) << "cudaMalloc failed";
+  return CudaPtr(ptr);
+}
+
+bool is_cuda_available() {
+#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE
+  __lsan_disable();
+#endif
+  int device_count = 0;
+  const cudaError_t err = cudaGetDeviceCount(&device_count);
+#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE
+  __lsan_enable();
+#endif
+  return err == cudaSuccess && device_count > 0;
+}
+
+std::vector<float> copy_cuda_to_host(const void* device_ptr, size_t numel) {
+  std::vector<float> host(numel);
+  const cudaError_t err = cudaMemcpy(
+      host.data(), device_ptr, numel * sizeof(float), cudaMemcpyDeviceToHost);
+  EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy D2H failed";
+  return host;
+}
+
+void copy_host_to_cuda(const std::vector<float>& host, void* device_ptr) {
+  const cudaError_t err = cudaMemcpy(
+      device_ptr,
+      host.data(),
+      host.size() * sizeof(float),
+      cudaMemcpyHostToDevice);
+  EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy H2D failed";
+}
+
+class CudaDeviceCopyOpTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    ASSERT_NE(get_device_allocator(DeviceType::CUDA), nullptr)
+        << "Linking cuda_backend should auto-register the CUDA allocator";
+  }
+
+  void SetUp() override {
+    if (!is_cuda_available()) {
+      GTEST_SKIP() << "CUDA not available, skipping CUDA device copy op tests";
+    }
+  }
+
+  Tensor& op_h2d_copy_out(const Tensor& self, Tensor& out) {
+    return torch::executor::et_copy::_h2d_copy_outf(context_, self, out);
+  }
+
+  Tensor& op_d2h_copy_out(const Tensor& self, Tensor& out) {
+    return torch::executor::et_copy::_d2h_copy_outf(context_, self, out);
+  }
+
+  KernelRuntimeContext context_;
+};
+
+} // namespace
+
+TEST_F(CudaDeviceCopyOpTest, H2dCopyUsesRegisteredCudaAllocator) {
+  std::vector<float> src_data = {1.0f, 2.0f, 3.0f, 4.0f};
+  auto device_data = allocate_cuda(src_data.size() * sizeof(float));
+  ASSERT_NE(device_data.get(), nullptr);
+
+  int32_t sizes[] = {static_cast<int32_t>(src_data.size())};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      src_data.data(),
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor src(&src_impl);
+
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      device_data.get(),
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor dst(&dst_impl);
+
+  Tensor& result = op_h2d_copy_out(src, dst);
+
+  EXPECT_EQ(context_.failure_state(), Error::Ok);
+  EXPECT_EQ(&result, &dst);
+  EXPECT_EQ(copy_cuda_to_host(device_data.get(), src_data.size()), src_data);
+}
+
+TEST_F(CudaDeviceCopyOpTest, D2hCopyUsesRegisteredCudaAllocator) {
+  const std::vector<float> expected = {5.0f, 6.0f, 7.0f, 8.0f};
+  auto device_data = allocate_cuda(expected.size() * sizeof(float));
+  ASSERT_NE(device_data.get(), nullptr);
+  copy_host_to_cuda(expected, device_data.get());
+
+  std::vector<float> dst_data(expected.size(), 0.0f);
+  int32_t sizes[] = {static_cast<int32_t>(expected.size())};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      device_data.get(),
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor src(&src_impl);
+
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      dst_data.data(),
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor dst(&dst_impl);
+
+  Tensor& result = op_d2h_copy_out(src, dst);
+
+  EXPECT_EQ(context_.failure_state(), Error::Ok);
+  EXPECT_EQ(&result, &dst);
+  EXPECT_EQ(dst_data, expected);
+}
diff --git a/kernels/portable/cpu/op__device_copy.cpp b/kernels/portable/cpu/op__device_copy.cpp
new file mode 100644
index 00000000000..5e1a51a83be
--- /dev/null
+++ b/kernels/portable/cpu/op__device_copy.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * Runtime kernels for et_copy._h2d_copy and et_copy._d2h_copy ops.
+ *
+ * These ops transfer tensor data between CPU and device memory using
+ * the DeviceAllocator interface. The device type is inferred from the
+ * tensor metadata (out.device_type() for H2D, self.device_type() for D2H),
+ * which was set during AOT serialization by PropagateDevicePass.
+ */
+
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace torch {
+namespace executor {
+namespace native {
+
+using Tensor = executorch::aten::Tensor;
+using DeviceAllocator = executorch::runtime::DeviceAllocator;
+using Error = executorch::runtime::Error;
+
+/**
+ * Copies tensor data from host (CPU) memory to device memory.
+ *
+ * self: source tensor on CPU
+ * out:  destination tensor on device (memory-planned by runtime)
+ *
+ * The device type and index are inferred from out's TensorImpl metadata.
+ */
+Tensor&
+_h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) {
+  auto device_type = out.unsafeGetTensorImpl()->device_type();
+  auto device_index = out.unsafeGetTensorImpl()->device_index();
+
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      self.unsafeGetTensorImpl()->device_type() ==
+          executorch::runtime::etensor::DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_h2d_copy: source tensor must be on CPU, got device_type=%d",
+      static_cast<int>(self.unsafeGetTensorImpl()->device_type()));
+
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      device_type != executorch::runtime::etensor::DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_h2d_copy: destination tensor must be on a non-CPU device");
+
+  auto nbytes = self.nbytes();
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      nbytes == out.nbytes(),
+      InvalidArgument,
+      out,
+      "_h2d_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu",
+      nbytes,
+      out.nbytes());
+
+  DeviceAllocator* allocator =
+      executorch::runtime::get_device_allocator(device_type);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      allocator != nullptr,
+      NotFound,
+      out,
+      "_h2d_copy: no device allocator registered for device_type=%d",
+      static_cast<int>(device_type));
+
+  Error err = allocator->copy_host_to_device(
+      out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      err == Error::Ok,
+      Internal,
+      out,
+      "_h2d_copy: copy_host_to_device failed");
+
+  return out;
+}
+
+/**
+ * Copies tensor data from device memory to host (CPU) memory.
+ *
+ * self: source tensor on device
+ * out:  destination tensor on CPU (memory-planned by runtime)
+ *
+ * The device type and index are inferred from self's TensorImpl metadata.
+ */
+Tensor&
+_d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) {
+  auto device_type = self.unsafeGetTensorImpl()->device_type();
+  auto device_index = self.unsafeGetTensorImpl()->device_index();
+
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      device_type != executorch::runtime::etensor::DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_d2h_copy: source tensor must be on a non-CPU device");
+
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      out.unsafeGetTensorImpl()->device_type() ==
+          executorch::runtime::etensor::DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_d2h_copy: destination tensor must be on CPU, got device_type=%d",
+      static_cast<int>(out.unsafeGetTensorImpl()->device_type()));
+
+  auto nbytes = self.nbytes();
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      nbytes == out.nbytes(),
+      InvalidArgument,
+      out,
+      "_d2h_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu",
+      nbytes,
+      out.nbytes());
+
+  DeviceAllocator* allocator =
+      executorch::runtime::get_device_allocator(device_type);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      allocator != nullptr,
+      NotFound,
+      out,
+      "_d2h_copy: no device allocator registered for device_type=%d",
+      static_cast<int>(device_type));
+
+  Error err = allocator->copy_device_to_host(
+      out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      err == Error::Ok,
+      Internal,
+      out,
+      "_d2h_copy: copy_device_to_host failed");
+
+  return out;
+}
+
+} // namespace native
+} // namespace executor
+} // namespace torch
diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml
index 620d97d050f..ecf62ee3606 100644
--- a/kernels/portable/functions.yaml
+++ b/kernels/portable/functions.yaml
@@ -1045,6 +1045,16 @@
     - arg_meta: null
       kernel_name: torch::executor::zeros_out
 
+- func: et_copy::_h2d_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::_h2d_copy_out
+
+- func: et_copy::_d2h_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::_d2h_copy_out
+
 - func: dim_order_ops::_empty_dim_order.out(int[] size, *, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
diff --git a/kernels/test/op__device_copy_test.cpp b/kernels/test/op__device_copy_test.cpp
new file mode 100644
index 00000000000..d345642bd37
--- /dev/null
+++ b/kernels/test/op__device_copy_test.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * Tests for et_copy._h2d_copy.out and et_copy._d2h_copy.out runtime kernels.
+ *
+ * Uses a MockDeviceAllocator to verify that the kernels correctly call
+ * copy_host_to_device / copy_device_to_host via the DeviceAllocator interface,
+ * and that device type is inferred from tensor metadata.
+ */
+
+#include <gtest/gtest.h>
+
+#include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/portable_type/tensor_impl.h>
+#include <executorch/runtime/platform/runtime.h>
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::aten::TensorImpl;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::Error;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+using TensorShapeDynamism = executorch::runtime::TensorShapeDynamism;
+
+namespace {
+
+class MockDeviceAllocator : public DeviceAllocator {
+ public:
+  Result<void*> allocate(
+      size_t nbytes,
+      DeviceIndex index,
+      size_t alignment = kDefaultAlignment) override {
+    return Error::NotSupported;
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {}
+
+  Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    h2d_call_count_++;
+    last_h2d_nbytes_ = nbytes;
+    last_h2d_device_index_ = index;
+    // Actually copy so we can verify data
+    std::memcpy(dst, src, nbytes);
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    d2h_call_count_++;
+    last_d2h_nbytes_ = nbytes;
+    last_d2h_device_index_ = index;
+    std::memcpy(dst, src, nbytes);
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return DeviceType::CUDA;
+  }
+
+  int h2d_call_count_ = 0;
+  int d2h_call_count_ = 0;
+  size_t last_h2d_nbytes_ = 0;
+  size_t last_d2h_nbytes_ = 0;
+  DeviceIndex last_h2d_device_index_ = -1;
+  DeviceIndex last_d2h_device_index_ = -1;
+};
+
+} // namespace
+
+static MockDeviceAllocator g_mock_cuda;
+
+class OpDeviceCopyTest : public OperatorTest {
+ protected:
+  Tensor& op_h2d_copy_out(const Tensor& self, Tensor& out) {
+    return torch::executor::et_copy::_h2d_copy_outf(context_, self, out);
+  }
+
+  Tensor& op_d2h_copy_out(const Tensor& self, Tensor& out) {
+    return torch::executor::et_copy::_d2h_copy_outf(context_, self, out);
+  }
+
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    if (get_device_allocator(DeviceType::CUDA) == nullptr) {
+      register_device_allocator(&g_mock_cuda);
+    }
+  }
+
+  void SetUp() override {
+    OperatorTest::SetUp();
+    g_mock_cuda.h2d_call_count_ = 0;
+    g_mock_cuda.d2h_call_count_ = 0;
+    g_mock_cuda.last_h2d_nbytes_ = 0;
+    g_mock_cuda.last_d2h_nbytes_ = 0;
+    g_mock_cuda.last_h2d_device_index_ = -1;
+    g_mock_cuda.last_d2h_device_index_ = -1;
+  }
+};
+
+TEST_F(OpDeviceCopyTest, H2dCopyCopiesDataAndCallsAllocator) {
+  // Set up a CPU source tensor with known data.
+  float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f};
+  int32_t sizes[] = {4};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      src_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor src(&src_impl);
+
+  // Set up a CUDA destination tensor (simulated with host memory).
+  float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f};
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      dst_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor dst(&dst_impl);
+
+  Tensor& result = op_h2d_copy_out(src, dst);
+
+  // Verify the allocator was called correctly.
+  EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 4 * sizeof(float));
+  EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 0);
+
+  // Verify data was copied (mock does a real memcpy).
+  EXPECT_EQ(dst_data[0], 1.0f);
+  EXPECT_EQ(dst_data[1], 2.0f);
+  EXPECT_EQ(dst_data[2], 3.0f);
+  EXPECT_EQ(dst_data[3], 4.0f);
+
+  // Verify return value is the out tensor.
+  EXPECT_EQ(&result, &dst);
+}
+
+TEST_F(OpDeviceCopyTest, D2hCopyCopiesDataAndCallsAllocator) {
+  // Set up a CUDA source tensor with known data.
+  float src_data[] = {5.0f, 6.0f, 7.0f, 8.0f};
+  int32_t sizes[] = {4};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      src_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor src(&src_impl);
+
+  // Set up a CPU destination tensor.
+  float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f};
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      dst_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor dst(&dst_impl);
+
+  Tensor& result = op_d2h_copy_out(src, dst);
+
+  // Verify the allocator was called correctly.
+  EXPECT_EQ(g_mock_cuda.d2h_call_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_d2h_nbytes_, 4 * sizeof(float));
+  EXPECT_EQ(g_mock_cuda.last_d2h_device_index_, 0);
+
+  // Verify data was copied.
+  EXPECT_EQ(dst_data[0], 5.0f);
+  EXPECT_EQ(dst_data[1], 6.0f);
+  EXPECT_EQ(dst_data[2], 7.0f);
+  EXPECT_EQ(dst_data[3], 8.0f);
+
+  EXPECT_EQ(&result, &dst);
+}
+
+TEST_F(OpDeviceCopyTest, H2dCopyWithDeviceIndex1) {
+  // Verify device_index is correctly forwarded to the allocator.
+  float src_data[] = {1.0f};
+  float dst_data[] = {0.0f};
+  int32_t sizes[] = {1};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      src_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor src(&src_impl);
+
+  // Device index = 1 (e.g., cuda:1)
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      dst_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      1);
+  Tensor dst(&dst_impl);
+
+  op_h2d_copy_out(src, dst);
+
+  EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 1);
+}
+
+TEST_F(OpDeviceCopyTest, H2dCopyMultidimensionalTensor) {
+  // Test with a 2D tensor [2, 3].
+  float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+  int32_t sizes[] = {2, 3};
+  uint8_t dim_order[] = {0, 1};
+  int32_t strides[] = {3, 1};
+
+  TensorImpl src_impl(
+      ScalarType::Float,
+      2,
+      sizes,
+      src_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor src(&src_impl);
+
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      2,
+      sizes,
+      dst_data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor dst(&dst_impl);
+
+  op_h2d_copy_out(src, dst);
+
+  EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 6 * sizeof(float));
+
+  for (int i = 0; i < 6; ++i) {
+    EXPECT_EQ(dst_data[i], src_data[i]);
+  }
+}
diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl
index bc51e336cb8..5212d691c5b 100644
--- a/kernels/test/targets.bzl
+++ b/kernels/test/targets.bzl
@@ -1,14 +1,14 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbsource//xplat/executorch/kernels/test:util.bzl", "codegen_function_header_wrapper", "op_test")
 
-def _common_op_test(name, kernels):
+def _common_op_test(name, kernels, deps = []):
     """
     Defines test targets in format of <kernel>_op_<op-name>_test
     For ATen kernel testing, let's use portable functions.yaml for tested ops.
     """
     for kernel in kernels:
-        deps = [":function_header_wrapper_{}".format(kernel)]
-        op_test(name, kernel_name = kernel, use_kernel_prefix = True, deps = deps)
+        op_deps = [":function_header_wrapper_{}".format(kernel)] + deps
+        op_test(name, kernel_name = kernel, use_kernel_prefix = True, deps = op_deps)
 
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.
@@ -177,6 +177,14 @@ def define_common_targets():
     _common_op_test("op__clone_dim_order_test", ["aten", "portable"])
     _common_op_test("op__conj_physical_test", ["aten", "portable"])
     _common_op_test("op__adaptive_avg_pool2d_test", ["aten", "portable"])
+    _common_op_test(
+        "op__device_copy_test",
+        ["portable"],
+        deps = [
+            "//executorch/runtime/core:device_allocator",
+            "//executorch/runtime/platform:platform",
+        ],
+    )
     _common_op_test("op_abs_test", ["aten", "portable"])
     _common_op_test("op_acos_test", ["aten", "portable"])
     _common_op_test("op_acosh_test", ["aten", "portable"])
diff --git a/shim_et/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl
index 5ffa7b65a36..318996784a1 100644
--- a/shim_et/xplat/executorch/codegen/codegen.bzl
+++ b/shim_et/xplat/executorch/codegen/codegen.bzl
@@ -535,6 +535,7 @@ def get_portable_lib_deps():
         "//executorch/kernels/portable/cpu:vec_ops",
         "//executorch/kernels/portable/cpu/pattern:all_deps",
         "//executorch/kernels/portable/cpu/util:all_deps",
+        "//executorch/runtime/core:device_allocator",
     ]
 
 def get_optimized_lib_deps():
diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
index cc2a0f78c75..479f3913f8f 100644
--- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
+++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
@@ -1405,6 +1405,12 @@ ATEN_OPS = (
             "//executorch/kernels/portable/cpu/util:copy_ops_util",
         ],
     ),
+    op_target(
+        name = "op__device_copy",
+        deps = [
+            "//executorch/runtime/core:device_allocator",
+        ],
+    ),
 )
 
 # Operators that are not listed in `functions.yaml` (i.e., operators listed in