diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl index b68043f7feb..a54c47e979d 100644 --- a/backends/cuda/runtime/shims/tests/targets.bzl +++ b/backends/cuda/runtime/shims/tests/targets.bzl @@ -42,3 +42,27 @@ def define_common_targets(): cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle") cuda_shim_cpp_unittest("aoti_torch_item_bool") cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out") + + cpp_unittest( + name = "test_op__device_copy", + srcs = ["test_op__device_copy.cpp"], + deps = [ + "//executorch/backends/cuda/runtime:cuda_backend", + "//executorch/kernels/portable:generated_lib", + "//executorch/kernels/portable:generated_lib_headers", + "//executorch/kernels/portable/cpu:op__device_copy", + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/portable_type:portable_type", + "//executorch/runtime/kernel:kernel_runtime_context", + "//executorch/runtime/platform:platform", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], + preprocessor_flags = ["-DCUDA_AVAILABLE=1"], + keep_gpu_sections = True, + remote_execution = re_test_utils.remote_execution( + platform = "gpu-remote-execution", + ), + ) diff --git a/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp new file mode 100644 index 00000000000..4e5c5a099b7 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#if (defined(__has_feature) && __has_feature(address_sanitizer)) || \ + defined(__SANITIZE_ADDRESS__) +#include +#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 1 +#else +#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 0 +#endif + +#include +#include +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::runtime::Error; +using executorch::runtime::get_device_allocator; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::TensorShapeDynamism; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +namespace { + +struct CudaDeleter { + void operator()(void* ptr) const { + if (ptr != nullptr) { + cudaFree(ptr); + } + } +}; + +using CudaPtr = std::unique_ptr; + +CudaPtr allocate_cuda(size_t nbytes) { + void* ptr = nullptr; + const cudaError_t err = cudaMalloc(&ptr, nbytes); + EXPECT_EQ(err, cudaSuccess) << "cudaMalloc failed"; + return CudaPtr(ptr); +} + +bool is_cuda_available() { +#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE + __lsan_disable(); +#endif + int device_count = 0; + const cudaError_t err = cudaGetDeviceCount(&device_count); +#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE + __lsan_enable(); +#endif + return err == cudaSuccess && device_count > 0; +} + +std::vector copy_cuda_to_host(const void* device_ptr, size_t numel) { + std::vector host(numel); + const cudaError_t err = cudaMemcpy( + host.data(), device_ptr, numel * sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy D2H failed"; + return host; +} + +void copy_host_to_cuda(const std::vector& host, void* device_ptr) { + const cudaError_t err = cudaMemcpy( + device_ptr, + host.data(), + host.size() * sizeof(float), + cudaMemcpyHostToDevice); + EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy H2D failed"; +} + +class CudaDeviceCopyOpTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + ASSERT_NE(get_device_allocator(DeviceType::CUDA), nullptr) + << "Linking cuda_backend should auto-register the CUDA allocator"; + } + + void SetUp() override { + if (!is_cuda_available()) { + GTEST_SKIP() << "CUDA not available, skipping CUDA device copy op tests"; + } + } + + Tensor& op_h2d_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::et_copy::_h2d_copy_outf(context_, self, out); + } + + Tensor& op_d2h_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::et_copy::_d2h_copy_outf(context_, self, out); + } + + KernelRuntimeContext context_; +}; + +} // namespace + +TEST_F(CudaDeviceCopyOpTest, H2dCopyUsesRegisteredCudaAllocator) { + std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f}; + auto device_data = allocate_cuda(src_data.size() * sizeof(float)); + ASSERT_NE(device_data.get(), nullptr); + + int32_t sizes[] = {static_cast(src_data.size())}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data.data(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + device_data.get(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor dst(&dst_impl); + + Tensor& result = op_h2d_copy_out(src, dst); + + EXPECT_EQ(context_.failure_state(), Error::Ok); + EXPECT_EQ(&result, &dst); + EXPECT_EQ(copy_cuda_to_host(device_data.get(), src_data.size()), src_data); +} + +TEST_F(CudaDeviceCopyOpTest, D2hCopyUsesRegisteredCudaAllocator) { + const std::vector expected = {5.0f, 6.0f, 7.0f, 8.0f}; + auto device_data = allocate_cuda(expected.size() * sizeof(float)); + ASSERT_NE(device_data.get(), nullptr); + copy_host_to_cuda(expected, device_data.get()); + + std::vector dst_data(expected.size(), 0.0f); + int32_t sizes[] = {static_cast(expected.size())}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + device_data.get(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor src(&src_impl); + + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data.data(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor dst(&dst_impl); + + Tensor& result = op_d2h_copy_out(src, dst); + + EXPECT_EQ(context_.failure_state(), Error::Ok); + EXPECT_EQ(&result, &dst); + EXPECT_EQ(dst_data, expected); +} diff --git a/kernels/portable/cpu/op__device_copy.cpp b/kernels/portable/cpu/op__device_copy.cpp new file mode 100644 index 00000000000..5e1a51a83be --- /dev/null +++ b/kernels/portable/cpu/op__device_copy.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Runtime kernels for et_copy._h2d_copy and et_copy._d2h_copy ops. + * + * These ops transfer tensor data between CPU and device memory using + * the DeviceAllocator interface. The device type is inferred from the + * tensor metadata (out.device_type() for H2D, self.device_type() for D2H), + * which was set during AOT serialization by PropagateDevicePass. + */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = executorch::aten::Tensor; +using DeviceAllocator = executorch::runtime::DeviceAllocator; +using Error = executorch::runtime::Error; + +/** + * Copies tensor data from host (CPU) memory to device memory. + * + * self: source tensor on CPU + * out: destination tensor on device (memory-planned by runtime) + * + * The device type and index are inferred from out's TensorImpl metadata. + */ +Tensor& +_h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) { + auto device_type = out.unsafeGetTensorImpl()->device_type(); + auto device_index = out.unsafeGetTensorImpl()->device_index(); + + ET_KERNEL_CHECK_MSG( + ctx, + self.unsafeGetTensorImpl()->device_type() == + executorch::runtime::etensor::DeviceType::CPU, + InvalidArgument, + out, + "_h2d_copy: source tensor must be on CPU, got device_type=%d", + static_cast(self.unsafeGetTensorImpl()->device_type())); + + ET_KERNEL_CHECK_MSG( + ctx, + device_type != executorch::runtime::etensor::DeviceType::CPU, + InvalidArgument, + out, + "_h2d_copy: destination tensor must be on a non-CPU device"); + + auto nbytes = self.nbytes(); + ET_KERNEL_CHECK_MSG( + ctx, + nbytes == out.nbytes(), + InvalidArgument, + out, + "_h2d_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu", + nbytes, + out.nbytes()); + + DeviceAllocator* allocator = + executorch::runtime::get_device_allocator(device_type); + ET_KERNEL_CHECK_MSG( + ctx, + allocator != nullptr, + NotFound, + out, + "_h2d_copy: no device allocator registered for device_type=%d", + static_cast(device_type)); + + Error err = allocator->copy_host_to_device( + out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index); + ET_KERNEL_CHECK_MSG( + ctx, + err == Error::Ok, + Internal, + out, + "_h2d_copy: copy_host_to_device failed"); + + return out; +} + +/** + * Copies tensor data from device memory to host (CPU) memory. + * + * self: source tensor on device + * out: destination tensor on CPU (memory-planned by runtime) + * + * The device type and index are inferred from self's TensorImpl metadata. + */ +Tensor& +_d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) { + auto device_type = self.unsafeGetTensorImpl()->device_type(); + auto device_index = self.unsafeGetTensorImpl()->device_index(); + + ET_KERNEL_CHECK_MSG( + ctx, + device_type != executorch::runtime::etensor::DeviceType::CPU, + InvalidArgument, + out, + "_d2h_copy: source tensor must be on a non-CPU device"); + + ET_KERNEL_CHECK_MSG( + ctx, + out.unsafeGetTensorImpl()->device_type() == + executorch::runtime::etensor::DeviceType::CPU, + InvalidArgument, + out, + "_d2h_copy: destination tensor must be on CPU, got device_type=%d", + static_cast(out.unsafeGetTensorImpl()->device_type())); + + auto nbytes = self.nbytes(); + ET_KERNEL_CHECK_MSG( + ctx, + nbytes == out.nbytes(), + InvalidArgument, + out, + "_d2h_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu", + nbytes, + out.nbytes()); + + DeviceAllocator* allocator = + executorch::runtime::get_device_allocator(device_type); + ET_KERNEL_CHECK_MSG( + ctx, + allocator != nullptr, + NotFound, + out, + "_d2h_copy: no device allocator registered for device_type=%d", + static_cast(device_type)); + + Error err = allocator->copy_device_to_host( + out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index); + ET_KERNEL_CHECK_MSG( + ctx, + err == Error::Ok, + Internal, + out, + "_d2h_copy: copy_device_to_host failed"); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 620d97d050f..ecf62ee3606 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -1045,6 +1045,16 @@ - arg_meta: null kernel_name: torch::executor::zeros_out +- func: et_copy::_h2d_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: torch::executor::_h2d_copy_out + +- func: et_copy::_d2h_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: torch::executor::_d2h_copy_out + - func: dim_order_ops::_empty_dim_order.out(int[] size, *, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null diff --git a/kernels/test/op__device_copy_test.cpp b/kernels/test/op__device_copy_test.cpp new file mode 100644 index 00000000000..d345642bd37 --- /dev/null +++ b/kernels/test/op__device_copy_test.cpp @@ -0,0 +1,297 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Tests for et_copy._h2d_copy.out and et_copy._d2h_copy.out runtime kernels. + * + * Uses a MockDeviceAllocator to verify that the kernels correctly call + * copy_host_to_device / copy_device_to_host via the DeviceAllocator interface, + * and that device type is inferred from tensor metadata. + */ + +#include + +#include // Declares the operator +#include +#include +#include +#include +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::runtime::DeviceAllocator; +using executorch::runtime::Error; +using executorch::runtime::get_device_allocator; +using executorch::runtime::register_device_allocator; +using executorch::runtime::Result; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +using TensorShapeDynamism = executorch::runtime::TensorShapeDynamism; + +namespace { + +class MockDeviceAllocator : public DeviceAllocator { + public: + Result allocate( + size_t nbytes, + DeviceIndex index, + size_t alignment = kDefaultAlignment) override { + return Error::NotSupported; + } + + void deallocate(void* ptr, DeviceIndex index) override {} + + Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + h2d_call_count_++; + last_h2d_nbytes_ = nbytes; + last_h2d_device_index_ = index; + // Actually copy so we can verify data + std::memcpy(dst, src, nbytes); + return Error::Ok; + } + + Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + d2h_call_count_++; + last_d2h_nbytes_ = nbytes; + last_d2h_device_index_ = index; + std::memcpy(dst, src, nbytes); + return Error::Ok; + } + + DeviceType device_type() const override { + return DeviceType::CUDA; + } + + int h2d_call_count_ = 0; + int d2h_call_count_ = 0; + size_t last_h2d_nbytes_ = 0; + size_t last_d2h_nbytes_ = 0; + DeviceIndex last_h2d_device_index_ = -1; + DeviceIndex last_d2h_device_index_ = -1; +}; + +} // namespace + +static MockDeviceAllocator g_mock_cuda; + +class OpDeviceCopyTest : public OperatorTest { + protected: + Tensor& op_h2d_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::et_copy::_h2d_copy_outf(context_, self, out); + } + + Tensor& op_d2h_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::et_copy::_d2h_copy_outf(context_, self, out); + } + + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + if (get_device_allocator(DeviceType::CUDA) == nullptr) { + register_device_allocator(&g_mock_cuda); + } + } + + void SetUp() override { + OperatorTest::SetUp(); + g_mock_cuda.h2d_call_count_ = 0; + g_mock_cuda.d2h_call_count_ = 0; + g_mock_cuda.last_h2d_nbytes_ = 0; + g_mock_cuda.last_d2h_nbytes_ = 0; + g_mock_cuda.last_h2d_device_index_ = -1; + g_mock_cuda.last_d2h_device_index_ = -1; + } +}; + +TEST_F(OpDeviceCopyTest, H2dCopyCopiesDataAndCallsAllocator) { + // Set up a CPU source tensor with known data. + float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int32_t sizes[] = {4}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + // Set up a CUDA destination tensor (simulated with host memory). + float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f}; + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor dst(&dst_impl); + + Tensor& result = op_h2d_copy_out(src, dst); + + // Verify the allocator was called correctly. + EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 4 * sizeof(float)); + EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 0); + + // Verify data was copied (mock does a real memcpy). + EXPECT_EQ(dst_data[0], 1.0f); + EXPECT_EQ(dst_data[1], 2.0f); + EXPECT_EQ(dst_data[2], 3.0f); + EXPECT_EQ(dst_data[3], 4.0f); + + // Verify return value is the out tensor. + EXPECT_EQ(&result, &dst); +} + +TEST_F(OpDeviceCopyTest, D2hCopyCopiesDataAndCallsAllocator) { + // Set up a CUDA source tensor with known data. + float src_data[] = {5.0f, 6.0f, 7.0f, 8.0f}; + int32_t sizes[] = {4}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor src(&src_impl); + + // Set up a CPU destination tensor. + float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f}; + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor dst(&dst_impl); + + Tensor& result = op_d2h_copy_out(src, dst); + + // Verify the allocator was called correctly. + EXPECT_EQ(g_mock_cuda.d2h_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_d2h_nbytes_, 4 * sizeof(float)); + EXPECT_EQ(g_mock_cuda.last_d2h_device_index_, 0); + + // Verify data was copied. + EXPECT_EQ(dst_data[0], 5.0f); + EXPECT_EQ(dst_data[1], 6.0f); + EXPECT_EQ(dst_data[2], 7.0f); + EXPECT_EQ(dst_data[3], 8.0f); + + EXPECT_EQ(&result, &dst); +} + +TEST_F(OpDeviceCopyTest, H2dCopyWithDeviceIndex1) { + // Verify device_index is correctly forwarded to the allocator. + float src_data[] = {1.0f}; + float dst_data[] = {0.0f}; + int32_t sizes[] = {1}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + // Device index = 1 (e.g., cuda:1) + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 1); + Tensor dst(&dst_impl); + + op_h2d_copy_out(src, dst); + + EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 1); +} + +TEST_F(OpDeviceCopyTest, H2dCopyMultidimensionalTensor) { + // Test with a 2D tensor [2, 3]. + float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + int32_t sizes[] = {2, 3}; + uint8_t dim_order[] = {0, 1}; + int32_t strides[] = {3, 1}; + + TensorImpl src_impl( + ScalarType::Float, + 2, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + TensorImpl dst_impl( + ScalarType::Float, + 2, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor dst(&dst_impl); + + op_h2d_copy_out(src, dst); + + EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 6 * sizeof(float)); + + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(dst_data[i], src_data[i]); + } +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index bc51e336cb8..5212d691c5b 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -1,14 +1,14 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/kernels/test:util.bzl", "codegen_function_header_wrapper", "op_test") -def _common_op_test(name, kernels): +def _common_op_test(name, kernels, deps = []): """ Defines test targets in format of _op__test For ATen kernel testing, let's use portable functions.yaml for tested ops. """ for kernel in kernels: - deps = [":function_header_wrapper_{}".format(kernel)] - op_test(name, kernel_name = kernel, use_kernel_prefix = True, deps = deps) + op_deps = [":function_header_wrapper_{}".format(kernel)] + deps + op_test(name, kernel_name = kernel, use_kernel_prefix = True, deps = op_deps) def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -177,6 +177,14 @@ def define_common_targets(): _common_op_test("op__clone_dim_order_test", ["aten", "portable"]) _common_op_test("op__conj_physical_test", ["aten", "portable"]) _common_op_test("op__adaptive_avg_pool2d_test", ["aten", "portable"]) + _common_op_test( + "op__device_copy_test", + ["portable"], + deps = [ + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/platform:platform", + ], + ) _common_op_test("op_abs_test", ["aten", "portable"]) _common_op_test("op_acos_test", ["aten", "portable"]) _common_op_test("op_acosh_test", ["aten", "portable"]) diff --git a/shim_et/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl index 5ffa7b65a36..318996784a1 100644 --- a/shim_et/xplat/executorch/codegen/codegen.bzl +++ b/shim_et/xplat/executorch/codegen/codegen.bzl @@ -535,6 +535,7 @@ def get_portable_lib_deps(): "//executorch/kernels/portable/cpu:vec_ops", "//executorch/kernels/portable/cpu/pattern:all_deps", "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/runtime/core:device_allocator", ] def get_optimized_lib_deps(): diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index cc2a0f78c75..479f3913f8f 100644 --- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -1405,6 +1405,12 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:copy_ops_util", ], ), + op_target( + name = "op__device_copy", + deps = [ + "//executorch/runtime/core:device_allocator", + ], + ), ) # Operators that are not listed in `functions.yaml` (i.e., operators listed in