From 3e88726c8797b78579e7f16b19846a198c9ccb8d Mon Sep 17 00:00:00 2001 From: ronnywang Date: Thu, 16 Jun 2022 21:42:39 +0800 Subject: [PATCH] [CustomKernel] add custom kernel c api (#42986) * [CustomKernel] add custom kernel c api * update * update * fix unable to export capi Co-authored-by: ronny1996 <524019753@qq.com> --- paddle/fluid/pybind/CMakeLists.txt | 6 +- paddle/fluid/pybind/imperative.cc | 11 + paddle/fluid/pybind/pybind.cc | 4 + paddle/phi/CMakeLists.txt | 4 + paddle/phi/backends/custom/custom_device.cc | 6 +- paddle/phi/backends/device_ext.h | 27 + paddle/phi/capi/CMakeLists.txt | 13 + paddle/phi/capi/all.cc | 19 + paddle/phi/capi/all.h | 30 + paddle/phi/capi/capi.h | 30 + paddle/phi/capi/include/c_data_type.h | 56 ++ paddle/phi/capi/include/c_device_context.h | 42 + paddle/phi/capi/include/c_int_array.h | 34 + paddle/phi/capi/include/c_kernel_context.h | 93 ++ paddle/phi/capi/include/c_kernel_factory.h | 72 ++ paddle/phi/capi/include/c_kernel_registry.h | 77 ++ paddle/phi/capi/include/c_place.h | 34 + paddle/phi/capi/include/c_scalar.h | 54 ++ paddle/phi/capi/include/c_tensor.h | 88 ++ paddle/phi/capi/include/common.h | 41 + paddle/phi/capi/include/data_type.h | 64 ++ paddle/phi/capi/include/kernel_registry.h | 338 ++++++++ paddle/phi/capi/include/kernel_utils.h | 812 ++++++++++++++++++ paddle/phi/capi/include/type_utils.h | 123 +++ paddle/phi/capi/include/wrapper_base.h | 497 +++++++++++ paddle/phi/capi/lib/CMakeLists.txt | 44 + paddle/phi/capi/lib/c_data_type.cc | 49 ++ paddle/phi/capi/lib/c_device_context.cc | 77 ++ paddle/phi/capi/lib/c_int_array.cc | 34 + paddle/phi/capi/lib/c_kernel_context.cc | 223 +++++ paddle/phi/capi/lib/c_kernel_factory.cc | 150 ++++ paddle/phi/capi/lib/c_kernel_registry.cc | 174 ++++ paddle/phi/capi/lib/c_place.cc | 30 + paddle/phi/capi/lib/c_scalar.cc | 81 ++ paddle/phi/capi/lib/c_tensor.cc | 302 +++++++ paddle/phi/kernels/funcs/math_function.cc | 6 + .../custom_kernel/custom_kernel_dot_c.cc | 49 ++ 
.../custom_kernel_dot_c_setup.py | 81 ++ .../custom_kernel/test_custom_kernel_dot.py | 36 + python/setup.py.in | 2 + 40 files changed, 3910 insertions(+), 3 deletions(-) create mode 100644 paddle/phi/capi/CMakeLists.txt create mode 100644 paddle/phi/capi/all.cc create mode 100644 paddle/phi/capi/all.h create mode 100644 paddle/phi/capi/capi.h create mode 100644 paddle/phi/capi/include/c_data_type.h create mode 100644 paddle/phi/capi/include/c_device_context.h create mode 100644 paddle/phi/capi/include/c_int_array.h create mode 100644 paddle/phi/capi/include/c_kernel_context.h create mode 100644 paddle/phi/capi/include/c_kernel_factory.h create mode 100644 paddle/phi/capi/include/c_kernel_registry.h create mode 100644 paddle/phi/capi/include/c_place.h create mode 100644 paddle/phi/capi/include/c_scalar.h create mode 100644 paddle/phi/capi/include/c_tensor.h create mode 100644 paddle/phi/capi/include/common.h create mode 100644 paddle/phi/capi/include/data_type.h create mode 100644 paddle/phi/capi/include/kernel_registry.h create mode 100644 paddle/phi/capi/include/kernel_utils.h create mode 100644 paddle/phi/capi/include/type_utils.h create mode 100644 paddle/phi/capi/include/wrapper_base.h create mode 100644 paddle/phi/capi/lib/CMakeLists.txt create mode 100644 paddle/phi/capi/lib/c_data_type.cc create mode 100644 paddle/phi/capi/lib/c_device_context.cc create mode 100644 paddle/phi/capi/lib/c_int_array.cc create mode 100644 paddle/phi/capi/lib/c_kernel_context.cc create mode 100644 paddle/phi/capi/lib/c_kernel_factory.cc create mode 100644 paddle/phi/capi/lib/c_kernel_registry.cc create mode 100644 paddle/phi/capi/lib/c_place.cc create mode 100644 paddle/phi/capi/lib/c_scalar.cc create mode 100644 paddle/phi/capi/lib/c_tensor.cc create mode 100644 python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c.cc create mode 100644 python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py diff --git a/paddle/fluid/pybind/CMakeLists.txt 
b/paddle/fluid/pybind/CMakeLists.txt index 20460c78d2867..e4d4bf1a1c441 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -123,6 +123,10 @@ set(PYBIND_SRCS communication.cc cuda_streams_py.cc) +if(WITH_CUSTOM_DEVICE) + set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi) +endif() + if(NOT ON_INFER) set(PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if(WITH_NCCL) @@ -491,7 +495,7 @@ if(WITH_PYTHON) cc_library( paddle_pybind SHARED SRCS ${PYBIND_SRCS} - DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${GLOB_DEV_LIB}) + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 3de6c64617ddd..0247cbe0e2b27 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1734,6 +1734,17 @@ void BindImperative(py::module *m_ptr) { return new_var; }, py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::CustomPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) .def( "_copy_to", [](const std::shared_ptr &self, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index cba7d03623516..b81f494f1a7df 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -147,6 +147,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif + #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #ifdef PADDLE_WITH_IPU diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index e20db18ea3f53..9715fd770422a 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -21,6 +21,10 @@ add_subdirectory(ops) add_subdirectory(tools) # phi tests add_subdirectory(tests) +# phi capi +if(WITH_CUSTOM_DEVICE) + add_subdirectory(capi) +endif() # make an unity target for compile deps set(PHI_DEPS diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index df757b286a6b1..541acd9ecafd0 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -348,7 +348,8 @@ class CustomDevice : public DeviceInterface { } } else { if (!pimpl_->memory_copy_p2p) { - std::unique_ptr tmp(new uint8_t[size]); + std::unique_ptr tmp( + reinterpret_cast(new uint8_t[size])); MemoryCopyD2H(src_dev_id, tmp.get(), src, size); MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size); } else { @@ -440,7 +441,8 @@ class CustomDevice : public DeviceInterface { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->device_memory_set(device, ptr, value, size)); } else { - std::unique_ptr tmp(new uint8_t[size]); + std::unique_ptr tmp( + reinterpret_cast(new uint8_t[size])); memset(tmp.get(), value, size); MemoryCopyH2D(dev_id, ptr, tmp.get(), size); } diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h index ff58f4f35fd32..77c9ee61858c1 100644 --- a/paddle/phi/backends/device_ext.h +++ b/paddle/phi/backends/device_ext.h @@ -25,6 +25,33 @@ extern "C" { #define PADDLE_CUSTOM_RUNTIME_MINOR_VERSION 1 #define PADDLE_CUSTOM_RUNTIME_PATCH_VERSION 1 +typedef enum { + UNDEFINED = 0, + BOOL, + UINT8, + UINT16, + UINT32, + UINT64, + INT8, + INT16, + INT32, + INT64, + FLOAT16, + 
FLOAT32, + FLOAT64, + BFLOAT16, +} C_DataType; + +typedef enum { + ANY = 0, + NHWC, + NCHW, + NCDHW, + NDHWC, + NUM_DATA_LAYOUTS, + ALL_LAYOUT = ANY, +} C_DataLayout; + typedef enum { C_SUCCESS = 0, // success C_WARNING, // results may not meet expectation (such as an asynchronous diff --git a/paddle/phi/capi/CMakeLists.txt b/paddle/phi/capi/CMakeLists.txt new file mode 100644 index 0000000000000..c00c38cfa3a8a --- /dev/null +++ b/paddle/phi/capi/CMakeLists.txt @@ -0,0 +1,13 @@ +add_subdirectory(lib) +cc_library( + phi_capi + SRCS all.cc + DEPS phi_c_data_type + phi_c_device_context + phi_c_int_array + phi_c_kernel_context + phi_c_kernel_factory + phi_c_kernel_registry + phi_c_place + phi_c_scalar + phi_c_tensor) diff --git a/paddle/phi/capi/all.cc b/paddle/phi/capi/all.cc new file mode 100644 index 0000000000000..3d9c9315b3136 --- /dev/null +++ b/paddle/phi/capi/all.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/capi/all.h" + +namespace paddle { +namespace capi {} // namespace capi +} // namespace paddle diff --git a/paddle/phi/capi/all.h b/paddle/phi/capi/all.h new file mode 100644 index 0000000000000..5bd31cafdf977 --- /dev/null +++ b/paddle/phi/capi/all.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" +#include "paddle/phi/capi/include/c_device_context.h" +#include "paddle/phi/capi/include/c_int_array.h" +#include "paddle/phi/capi/include/c_kernel_context.h" +#include "paddle/phi/capi/include/c_kernel_factory.h" +#include "paddle/phi/capi/include/c_kernel_registry.h" +#include "paddle/phi/capi/include/c_place.h" +#include "paddle/phi/capi/include/c_scalar.h" +#include "paddle/phi/capi/include/c_tensor.h" +#include "paddle/phi/capi/include/data_type.h" +#include "paddle/phi/capi/include/kernel_registry.h" + +#endif diff --git a/paddle/phi/capi/capi.h b/paddle/phi/capi/capi.h new file mode 100644 index 0000000000000..f8e5a90ddf883 --- /dev/null +++ b/paddle/phi/capi/capi.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/common.h" + +PD_DECLARE_CAPI(data_type); +PD_DECLARE_CAPI(device_context); +PD_DECLARE_CAPI(int_array); +PD_DECLARE_CAPI(kernel_context); +PD_DECLARE_CAPI(kernel_factory); +PD_DECLARE_CAPI(kernel_registry); +PD_DECLARE_CAPI(place); +PD_DECLARE_CAPI(scalar); +PD_DECLARE_CAPI(tensor); + +#endif diff --git a/paddle/phi/capi/include/c_data_type.h b/paddle/phi/capi/include/c_data_type.h new file mode 100644 index 0000000000000..e33d04705206c --- /dev/null +++ b/paddle/phi/capi/include/c_data_type.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include + +#include "paddle/phi/backends/device_ext.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef C_Status PD_Status; + +typedef C_DataType PD_DataType; + +typedef C_DataLayout PD_DataLayout; + +typedef struct { + size_t size; + void *data; +} PD_List; + +void PD_DeletePointerList(PD_List list); + +void PD_DeleteUInt8List(PD_List list); + +void PD_DeleteInt64List(PD_List list); + +void PD_DeleteInt32List(PD_List list); + +void PD_DeleteFloat64List(PD_List list); + +void PD_DeleteFloat32List(PD_List list); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_device_context.h b/paddle/phi/capi/include/c_device_context.h new file mode 100644 index 0000000000000..68621d58ad9d5 --- /dev/null +++ b/paddle/phi/capi/include/c_device_context.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" +#include "paddle/phi/capi/include/c_tensor.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_DeviceContext PD_DeviceContext; + +typedef C_Stream PD_Stream; + +PD_Stream PD_DeviceContextGetStream(const PD_DeviceContext *ctx, + PD_Status *status); + +void *PD_DeviceContextAllocateTensor(const PD_DeviceContext *ctx, + PD_Tensor *tensor, + size_t size, + PD_DataType dtype, + PD_Status *status); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_int_array.h b/paddle/phi/capi/include/c_int_array.h new file mode 100644 index 0000000000000..dbc13b3abea4f --- /dev/null +++ b/paddle/phi/capi/include/c_int_array.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_IntArray PD_IntArray; + +PD_List PD_IntArrayGetDataPointer(PD_IntArray *int_array); + +size_t PD_IntArrayGetElementCount(PD_IntArray *int_array); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_kernel_context.h b/paddle/phi/capi/include/c_kernel_context.h new file mode 100644 index 0000000000000..c06cb3cd30086 --- /dev/null +++ b/paddle/phi/capi/include/c_kernel_context.h @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" +#include "paddle/phi/capi/include/c_device_context.h" +#include "paddle/phi/capi/include/c_int_array.h" +#include "paddle/phi/capi/include/c_place.h" +#include "paddle/phi/capi/include/c_scalar.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_KernelContext PD_KernelContext; + +/** + * KernelContext + */ + +PD_DeviceContext *PD_KernelContextGetDeviceContext(PD_KernelContext *ctx); + +PD_Tensor *PD_KernelContextInputAt(PD_KernelContext *ctx, size_t index); + +// PD_Tensor *PD_KernelContextOptionalInputAt(PD_KernelContext *ctx, size_t +// index); + +PD_List PD_KernelContextMultiInputAt(PD_KernelContext *ctx, size_t index); + +PD_Tensor *PD_KernelContextOutputAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextMultiOutputAt(PD_KernelContext *ctx, size_t index); + +/** + * Attribute + */ + +bool PD_KernelContextBoolAttrAt(PD_KernelContext *ctx, size_t index); + +int32_t PD_KernelContextInt32AttrAt(PD_KernelContext *ctx, size_t index); + +int64_t PD_KernelContextInt64AttrAt(PD_KernelContext *ctx, size_t index); + +float PD_KernelContextFloatAttrAt(PD_KernelContext *ctx, size_t index); + +double PD_KernelContextDoubleAttrAt(PD_KernelContext *ctx, size_t index); + +PD_Scalar *PD_KernelContextScalarAttrAt(PD_KernelContext *ctx, size_t index); + +PD_IntArray *PD_KernelContextIntArrayAttrAt(PD_KernelContext *ctx, + size_t index); + +PD_DataType PD_KernelContextDataTypeAttrAt(PD_KernelContext *ctx, size_t index); + +PD_DataLayout PD_KernelContextDataLayoutAttrAt(PD_KernelContext *ctx, + size_t index); + +char *PD_KernelContextStringAttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListBoolAttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListInt32AttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListInt64AttrAt(PD_KernelContext *ctx, size_t index); + +PD_List 
PD_KernelContextListFloatAttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListDoubleAttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListStringAttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListScalarAttrAt(PD_KernelContext *ctx, size_t index); + +PD_Place *PD_KernelContextPlaceAttrAt(PD_KernelContext *ctx, size_t index); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_kernel_factory.h b/paddle/phi/capi/include/c_kernel_factory.h new file mode 100644 index 0000000000000..f84f16ba52011 --- /dev/null +++ b/paddle/phi/capi/include/c_kernel_factory.h @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_KernelKey PD_KernelKey; + +typedef struct PD_Kernel PD_Kernel; + +typedef struct PD_KernelArgsDef PD_KernelArgsDef; + +typedef struct PD_TensorArgDef PD_TensorArgDef; + +/** + * TensorArgDef + */ + +void PD_TensorArgDefSetDataLayout(PD_TensorArgDef *def, + PD_DataLayout layout, + PD_Status *status); + +void PD_TensorArgDefSetDataType(PD_TensorArgDef *def, + PD_DataType dtype, + PD_Status *status); + +/** + * KernelArgsDef + */ + +PD_List PD_KernelArgsDefGetInputArgDefs(PD_KernelArgsDef *def, + PD_Status *status); + +PD_List PD_KernelArgsDefGetOutputArgDefs(PD_KernelArgsDef *def, + PD_Status *status); + +/** + * KernelKey + */ + +PD_DataLayout PD_KernelKeyGetLayout(PD_KernelKey *key, PD_Status *status); + +PD_DataType PD_KernelKeyGetDataType(PD_KernelKey *key, PD_Status *status); + +/** + * Kernel + */ + +PD_KernelArgsDef *PD_KernelGetArgsDef(PD_Kernel *kernel, PD_Status *status); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_kernel_registry.h b/paddle/phi/capi/include/c_kernel_registry.h new file mode 100644 index 0000000000000..04990be436be9 --- /dev/null +++ b/paddle/phi/capi/include/c_kernel_registry.h @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include + +#include "paddle/phi/capi/include/c_data_type.h" +#include "paddle/phi/capi/include/c_kernel_context.h" +#include "paddle/phi/capi/include/c_kernel_factory.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + PD_ARG_TYPE_CONTEXT = 0, + PD_ARG_TYPE_TENSOR, + PD_ARG_TYPE_BOOL, + PD_ARG_TYPE_BFLOAT16, + PD_ARG_TYPE_FLOAT16, + PD_ARG_TYPE_FLOAT32, + PD_ARG_TYPE_FLOAT64, + PD_ARG_TYPE_INT32, + PD_ARG_TYPE_INT64, + PD_ARG_TYPE_STRING, + PD_ARG_TYPE_SCALAR, + PD_ARG_TYPE_INT_ARRAY, + PD_ARG_TYPE_DATA_TYPE, + PD_ARG_TYPE_DATA_LAYOUT, + PD_ARG_TYPE_PLACE, + PD_ARG_TYPE_LIST_BOOL, + PD_ARG_TYPE_LIST_INT32, + PD_ARG_TYPE_LIST_INT64, + PD_ARG_TYPE_LIST_BFLOAT16, + PD_ARG_TYPE_LIST_FLOAT16, + PD_ARG_TYPE_LIST_FLOAT32, + PD_ARG_TYPE_LIST_FLOAT64, + PD_ARG_TYPE_LIST_STRING, + PD_ARG_TYPE_LIST_SCALAR, + PD_ARG_TYPE_OPTIONAL_TENSOR, + PD_ARG_TYPE_LIST_TENSOR, + PD_ARG_TYPE_OPTIONAL_MULTI_TENSOR, +} PD_KernelArgumentType; + +void PD_RegisterPhiKernel(const char *kernel_name_cstr, + const char *backend_cstr, + PD_DataType pd_dtype, + PD_DataLayout pd_layout, + size_t in_nargs, + PD_KernelArgumentType *in_args_type, + size_t attr_nargs, + PD_KernelArgumentType *attr_args_type, + size_t out_nargs, + PD_KernelArgumentType *out_args_type, + void (*args_def_fn)(const PD_KernelKey *, + PD_Kernel *), + void (*fn)(PD_KernelContext *), + void *variadic_kernel_fn); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_place.h b/paddle/phi/capi/include/c_place.h new file mode 100644 index 0000000000000..bbdc45cbe8d46 --- /dev/null +++ b/paddle/phi/capi/include/c_place.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_Place PD_Place; + +bool PD_PlaceIsHost(PD_Place *place); + +int8_t PD_PlaceGetDeviceId(PD_Place *place); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_scalar.h b/paddle/phi/capi/include/c_scalar.h new file mode 100644 index 0000000000000..3ea3c3fc12c65 --- /dev/null +++ b/paddle/phi/capi/include/c_scalar.h @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_Scalar PD_Scalar; + +bool PD_ScalarGetBoolData(PD_Scalar *scalar); + +int8_t PD_ScalarGetInt8Data(PD_Scalar *scalar); + +int16_t PD_ScalarGetInt16Data(PD_Scalar *scalar); + +int32_t PD_ScalarGetInt32Data(PD_Scalar *scalar); + +int64_t PD_ScalarGetInt64Data(PD_Scalar *scalar); + +uint8_t PD_ScalarGetUInt8Data(PD_Scalar *scalar); + +uint16_t PD_ScalarGetUInt16Data(PD_Scalar *scalar); + +uint32_t PD_ScalarGetUInt32Data(PD_Scalar *scalar); + +uint64_t PD_ScalarGetUInt64Data(PD_Scalar *scalar); + +float PD_ScalarGetFloat32Data(PD_Scalar *scalar); + +double PD_ScalarGetFloat64Data(PD_Scalar *scalar); + +PD_DataType PD_ScalarGetDataType(PD_Scalar *scalar); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_tensor.h b/paddle/phi/capi/include/c_tensor.h new file mode 100644 index 0000000000000..494346713cf53 --- /dev/null +++ b/paddle/phi/capi/include/c_tensor.h @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_Tensor PD_Tensor; + +PD_DataType PD_TensorGetDataType(const PD_Tensor *tensor, PD_Status *status); + +PD_DataLayout PD_TensorGetDataLayout(const PD_Tensor *tensor, + PD_Status *status); + +int64_t PD_TensorGetByteSize(const PD_Tensor *tensor, PD_Status *status); + +void *PD_TensorGetDataPointer(const PD_Tensor *tensor, PD_Status *status); + +int64_t PD_TensorGetElementCount(const PD_Tensor *tensor, PD_Status *status); + +int64_t PD_TensorGetNumDims(const PD_Tensor *tensor, PD_Status *status); + +int64_t PD_TensorGetDim(const PD_Tensor *tensor, + size_t index, + PD_Status *status); + +void PD_TensorGetLoD(const PD_Tensor *tensor, + PD_List *data, + PD_List *offset, + PD_Status *status); + +bool PD_TensorIsInitialized(const PD_Tensor *tensor, PD_Status *status); + +bool PD_TensorIsValid(const PD_Tensor *tensor, PD_Status *status); + +void *PD_TensorGetHolder(const PD_Tensor *tensor, PD_Status *status); + +void PD_TensorSetDims(PD_Tensor *tensor, + int64_t ndims, + const int64_t *dims, + PD_Status *status); + +void PD_TensorSetDataType(PD_Tensor *tensor, + PD_DataType dtype, + PD_Status *status); + +void PD_TensorSetDataLayout(PD_Tensor *tensor, + PD_DataLayout layout, + PD_Status *status); + +void PD_TensorResetLoD(PD_Tensor *tensor, + PD_List data, + PD_List offset, + PD_Status *status); + +PD_Tensor *PD_NewTensor(); + +void PD_DeleteTensor(PD_Tensor *tensor); + +void PD_TensorShareDataWith(PD_Tensor *dst, + const PD_Tensor *src, + PD_Status *status); + +void PD_TensorShareLoDWith(PD_Tensor *dst, + const PD_Tensor *src, + PD_Status *status); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/common.h b/paddle/phi/capi/include/common.h new file mode 100644 index 0000000000000..2d2bc231f479b --- /dev/null +++ b/paddle/phi/capi/include/common.h @@ -0,0 
+1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#if !defined(_WIN32) && !defined(__APPLE__) + +#define PD_CUSTOM_PHI_KERNEL_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + _PD_CUSTOM_PHI_KERNEL_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) + +#define _PD_CUSTOM_PHI_KERNEL_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +#define PD_DECLARE_CAPI(module_name) \ + PD_CUSTOM_PHI_KERNEL_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_DECLARE_tp_kernel_ns_check_##module_name##_, \ + "PD_DECLARE_KERNEL must be called in global namespace."); \ + extern int TouchCAPISymbolFor##module_name##_(); \ + UNUSED static int __declare_capi_symbol_for_##module_name##_ = \ + TouchCAPISymbolFor##module_name##_() + +#define PD_REGISTER_CAPI(module_name) \ + PD_CUSTOM_PHI_KERNEL_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_DECLARE_tp_kernel_ns_check_##module_name##_, \ + "PD_DECLARE_KERNEL must be called in global namespace."); \ + int TouchCAPISymbolFor##module_name##_() { return 0; } + +#endif diff --git a/paddle/phi/capi/include/data_type.h b/paddle/phi/capi/include/data_type.h new file mode 100644 index 0000000000000..6acbf026e8cb6 --- /dev/null +++ b/paddle/phi/capi/include/data_type.h @@ -0,0 +1,64 @@ +// 
Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" + +namespace phi { + +namespace capi { + +#define CPP_TYPE_TO_PD_DTYPE_REGISTER(_) \ + _(bool, PD_DataType::BOOL) \ + _(phi::dtype::bfloat16, PD_DataType::BFLOAT16) \ + _(phi::dtype::float16, PD_DataType::FLOAT16) \ + _(float, PD_DataType::FLOAT32) \ + _(double, PD_DataType::FLOAT64) \ + _(uint8_t, PD_DataType::UINT8) \ + _(uint16_t, PD_DataType::UINT16) \ + _(uint32_t, PD_DataType::UINT32) \ + _(uint64_t, PD_DataType::UINT64) \ + _(int8_t, PD_DataType::INT8) \ + _(int16_t, PD_DataType::INT16) \ + _(int32_t, PD_DataType::INT32) \ + _(int64_t, PD_DataType::INT64) + +template +struct CppTypeToPDType; + +#define CPP_TYPE_TO_PD_DTYPE(x, y) \ + template <> \ + struct CppTypeToPDType { \ + constexpr static PD_DataType Type() { return y; } \ + }; + +template +struct PDTypeToCppType; + +#define PD_DTYPE_TO_CPP_TYPE(x, y) \ + template <> \ + struct PDTypeToCppType { \ + using type = x; \ + }; + +CPP_TYPE_TO_PD_DTYPE_REGISTER(CPP_TYPE_TO_PD_DTYPE) +CPP_TYPE_TO_PD_DTYPE_REGISTER(PD_DTYPE_TO_CPP_TYPE) + +} // namespace capi +} // namespace phi + +#endif diff --git a/paddle/phi/capi/include/kernel_registry.h b/paddle/phi/capi/include/kernel_registry.h new file mode 100644 index 0000000000000..37b045a60658b --- /dev/null +++ 
b/paddle/phi/capi/include/kernel_registry.h @@ -0,0 +1,338 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/wrapper_base.h" + +namespace phi { +namespace capi { + +inline phi::capi::DeviceContext PD_GetDeviceContext(PD_KernelContext *ctx) { + return phi::capi::DeviceContext(PD_KernelContextGetDeviceContext(ctx)); +} + +inline phi::capi::DenseTensor PD_InputAt(PD_KernelContext *ctx, size_t index) { + return phi::capi::DenseTensor(PD_KernelContextInputAt(ctx, index)); +} + +inline paddle::optional PD_OptionalInputAt( + PD_KernelContext *ctx, size_t index) { + auto tensor = PD_KernelContextInputAt(ctx, index); + return tensor + ? 
paddle::optional(phi::capi::DenseTensor( + reinterpret_cast(tensor))) + : paddle::optional(paddle::none); +} + +inline std::vector PD_MultiInputAt( + PD_KernelContext *ctx, size_t index) { + std::vector ret; + auto list = PD_KernelContextMultiInputAt(ctx, index); + auto data = reinterpret_cast(list.data); + for (size_t i = 0; i < list.size; ++i) { + ret.emplace_back(data[i]); + } + return ret; +} + +inline phi::capi::DenseTensor PD_OutputAt(PD_KernelContext *ctx, size_t index) { + return phi::capi::DenseTensor(PD_KernelContextOutputAt(ctx, index)); +} + +inline std::vector PD_MultiOutputAt( + PD_KernelContext *ctx, size_t index) { + std::vector ret; + auto list = PD_KernelContextMultiOutputAt(ctx, index); + auto data = reinterpret_cast(list.data); + for (size_t i = 0; i < list.size; ++i) { + ret.emplace_back(data[i]); + } + return ret; +} + +template +inline std::vector PD_GetPointerVector(std::vector *vec) { + std::vector ret; + for (auto &item : vec) { + ret.push_back(&item); + } + return ret; +} + +template +inline T PD_AttrAt(PD_KernelContext *ctx, size_t index); + +template <> +inline bool PD_AttrAt(PD_KernelContext *ctx, size_t index) { + return PD_KernelContextBoolAttrAt(ctx, index); +} + +template <> +inline int32_t PD_AttrAt(PD_KernelContext *ctx, size_t index) { + return PD_KernelContextInt32AttrAt(ctx, index); +} + +template <> +inline int64_t PD_AttrAt(PD_KernelContext *ctx, size_t index) { + return PD_KernelContextInt64AttrAt(ctx, index); +} + +template <> +inline float PD_AttrAt(PD_KernelContext *ctx, size_t index) { + return PD_KernelContextFloatAttrAt(ctx, index); +} + +template <> +inline double PD_AttrAt(PD_KernelContext *ctx, size_t index) { + return PD_KernelContextDoubleAttrAt(ctx, index); +} + +template <> +inline std::string PD_AttrAt(PD_KernelContext *ctx, size_t index) { + return PD_KernelContextStringAttrAt(ctx, index); +} + +template <> +inline PD_DataType PD_AttrAt(PD_KernelContext *ctx, size_t index) { + return 
PD_KernelContextDataTypeAttrAt(ctx, index); +} + +template <> +inline PD_DataLayout PD_AttrAt(PD_KernelContext *ctx, + size_t index) { + return PD_KernelContextDataLayoutAttrAt(ctx, index); +} + +template <> +inline std::vector PD_AttrAt>( + PD_KernelContext *ctx, size_t index) { + auto list = PD_KernelContextListInt32AttrAt(ctx, index); + auto data = reinterpret_cast(list.data); + std::vector cc_list(data, data + list.size); + return cc_list; +} + +template <> +inline std::vector PD_AttrAt>( + PD_KernelContext *ctx, size_t index) { + auto list = PD_KernelContextListInt64AttrAt(ctx, index); + auto data = reinterpret_cast(list.data); + std::vector cc_list(data, data + list.size); + return cc_list; +} + +template <> +inline std::vector PD_AttrAt>(PD_KernelContext *ctx, + size_t index) { + auto list = PD_KernelContextListFloatAttrAt(ctx, index); + auto data = reinterpret_cast(list.data); + std::vector cc_list(data, data + list.size); + return cc_list; +} + +template <> +inline std::vector PD_AttrAt>(PD_KernelContext *ctx, + size_t index) { + auto list = PD_KernelContextListDoubleAttrAt(ctx, index); + auto data = reinterpret_cast(list.data); + std::vector cc_list(data, data + list.size); + return cc_list; +} + +template <> +inline phi::capi::Scalar PD_AttrAt(PD_KernelContext *ctx, + size_t index) { + auto scalar = PD_KernelContextScalarAttrAt(ctx, index); + return phi::capi::Scalar(scalar); +} + +template <> +inline phi::capi::IntArray PD_AttrAt(PD_KernelContext *ctx, + size_t index) { + auto int_array = PD_KernelContextIntArrayAttrAt(ctx, index); + return phi::capi::IntArray(int_array); +} + +template <> +inline phi::capi::Place PD_AttrAt(PD_KernelContext *ctx, + size_t index) { + auto place = PD_KernelContextPlaceAttrAt(ctx, index); + return phi::capi::Place(place); +} + +template <> +inline std::vector PD_AttrAt>( + PD_KernelContext *ctx, size_t index) { + auto c_list = PD_KernelContextListScalarAttrAt(ctx, index); + auto data = reinterpret_cast(c_list.data); + 
std::vector list; + for (size_t i = 0; i < c_list.size; ++i) { + list.emplace_back(data[i]); + } + PD_DeletePointerList(c_list); + return list; +} + +template <> +inline std::vector PD_AttrAt>( + PD_KernelContext *ctx, size_t index) { + auto c_list = PD_KernelContextListScalarAttrAt(ctx, index); + auto data = reinterpret_cast(c_list.data); + std::vector list; + for (size_t i = 0; i < c_list.size; ++i) { + list.emplace_back(data[i]); + } + PD_DeletePointerList(c_list); + return list; +} + +template <> +inline std::vector PD_AttrAt>(PD_KernelContext *ctx, + size_t index) { + auto c_list = PD_KernelContextListBoolAttrAt(ctx, index); + std::vector list; + auto data = reinterpret_cast(c_list.data); + for (size_t i = 0; i < c_list.size; ++i) { + list[i] = static_cast(data[i]); + } + PD_DeleteUInt8List(c_list); + return list; +} + +#define CPP_TYPE_TO_PD_ARG_TYPE_REGISTER(_) \ + _(phi::capi::DenseTensor, ::PD_KernelArgumentType::PD_ARG_TYPE_TENSOR) \ + _(phi::capi::DeviceContext, ::PD_KernelArgumentType::PD_ARG_TYPE_CONTEXT) \ + _(bool, ::PD_KernelArgumentType::PD_ARG_TYPE_BOOL) \ + _(float, ::PD_KernelArgumentType::PD_ARG_TYPE_FLOAT32) \ + _(double, ::PD_KernelArgumentType::PD_ARG_TYPE_FLOAT64) \ + _(int32_t, ::PD_KernelArgumentType::PD_ARG_TYPE_INT32) \ + _(int64_t, ::PD_KernelArgumentType::PD_ARG_TYPE_INT64) \ + _(PD_DataType, ::PD_KernelArgumentType::PD_ARG_TYPE_DATA_TYPE) \ + _(PD_DataLayout, ::PD_KernelArgumentType::PD_ARG_TYPE_DATA_LAYOUT) \ + _(std::vector, ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_INT32) \ + _(std::vector, ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_INT64) \ + _(std::vector, ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_FLOAT32) \ + _(std::vector, ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_FLOAT64) \ + _(std::vector, ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_BOOL) \ + _(std::string, ::PD_KernelArgumentType::PD_ARG_TYPE_STRING) \ + _(phi::capi::Scalar, ::PD_KernelArgumentType::PD_ARG_TYPE_SCALAR) \ + _(phi::capi::IntArray, 
::PD_KernelArgumentType::PD_ARG_TYPE_INT_ARRAY) \ + _(phi::capi::Place, ::PD_KernelArgumentType::PD_ARG_TYPE_PLACE) \ + _(std::vector, \ + ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_STRING) \ + _(std::vector, \ + ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_SCALAR) + +template +struct CppTypeToPDArgumentType; + +#define CPP_TYPE_TO_PD_ARG_TYPE(x, y) \ + template <> \ + struct CppTypeToPDArgumentType { \ + constexpr static ::PD_KernelArgumentType Type() { return y; } \ + }; + +template <::PD_KernelArgumentType T> +struct PDArgumentTypeToCppType; + +#define PD_ARG_TYPE_TO_CPP_TYPE(x, y) \ + template <> \ + struct PDArgumentTypeToCppType { \ + using type = x; \ + }; + +CPP_TYPE_TO_PD_ARG_TYPE_REGISTER(CPP_TYPE_TO_PD_ARG_TYPE) +CPP_TYPE_TO_PD_ARG_TYPE_REGISTER(PD_ARG_TYPE_TO_CPP_TYPE) + +} // namespace capi + +using LoD = capi::LoD; +using Context = capi::DeviceContext; +using DenseTensor = capi::DenseTensor; +using Scalar = capi::Scalar; +using IntArray = capi::IntArray; +using Place = capi::Place; +using DataType = ::PD_DataType; +using DataLayout = ::PD_DataLayout; + +} // namespace phi + +#include "paddle/phi/capi/include/kernel_utils.h" + +// clang-format off + +#define PD_BUILD_PHI_KERNEL(kernel_name, \ + backend, \ + layout, \ + meta_kernel_fn, \ + ...) 
\ + static void \ + __CUSTOM_adefs_CFN_##kernel_name##_##backend##_##layout( \ + const PD_KernelKey* kernel_key, PD_Kernel* kernel); \ + template \ + struct __##kernel_name##_##backend##_##layout##__ { \ + __##kernel_name##_##backend##_##layout##__() { \ + ::phi::capi::CustomKernelArgsParseFunctor)> \ + parser; \ + PD_RegisterPhiKernel( \ + #kernel_name, \ + #backend, \ + ::phi::capi::CppTypeToPDType::Type(), \ + PD_DATALAYOUT(layout), \ + parser.in_args_type.size(), \ + parser.in_args_type.data(), \ + parser.attr_args_type.size(), \ + parser.attr_args_type.data(), \ + parser.out_args_type.size(), \ + parser.out_args_type.data(), \ + __CUSTOM_adefs_CFN_##kernel_name##_##backend##_##layout, \ + CUSTOM_PHI_KERNEL(meta_kernel_fn), \ + CUSTOM_PHI_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ + } \ + static void Touch() {} \ + }; \ + PD_CUSTOM_PHI_KERNEL_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + CUSTOM_tp_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_BUILD_KERNEL must be called in global namespace."); \ + static void \ + __CUSTOM_adefs_FN_##kernel_name##_##backend##_##layout( \ + const ::phi::capi::KernelKey &kernel_key, \ + ::phi::capi::Kernel* kernel); \ + _PD_BUILD_PHI_KERNEL(__##kernel_name##_##backend##_##layout##__, \ + kernel_name, \ + backend, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) \ + void \ + __CUSTOM_adefs_CFN_##kernel_name##_##backend##_##layout( \ + const PD_KernelKey* kernel_key, PD_Kernel* kernel) { \ + auto cc_kernel = ::phi::capi::Kernel(kernel); \ + __CUSTOM_adefs_FN_##kernel_name##_##backend##_##layout( \ + ::phi::capi::KernelKey( \ + const_cast(kernel_key)), \ + &cc_kernel); \ + } \ + void \ + __CUSTOM_adefs_FN_##kernel_name##_##backend##_##layout( \ + const ::phi::capi::KernelKey &kernel_key, \ + ::phi::capi::Kernel* kernel) + +// clang-format on + +#endif diff --git a/paddle/phi/capi/include/kernel_utils.h b/paddle/phi/capi/include/kernel_utils.h new file mode 100644 index 0000000000000..7302e6f4677b3 --- /dev/null +++ 
b/paddle/phi/capi/include/kernel_utils.h @@ -0,0 +1,812 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/capi/include/common.h" + +#if !defined(_WIN32) && !defined(__APPLE__) + +namespace phi { +namespace capi { + +#define CUSTOM_PHI_KERNEL(...) \ + ::phi::capi::CustomKernelImpl::Compute + +#define CUSTOM_PHI_VARIADIC_KERNEL(...) \ + reinterpret_cast( \ + &::phi::capi::CustomKernelImpl::VariadicCompute) + +#define PD_CUSTOM_NARGS(...) \ + _PD_CUSTOM_NARGS((__VA_ARGS__, _PD_CUSTOM_RESQ_N())) +#define _PD_CUSTOM_NARGS(...) _PD_CUSTOM_ARG_N(__VA_ARGS__) +#define _PD_CUSTOM_ARG_N_EXPAND( \ + _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, N, ...) 
\ + N +#define _PD_CUSTOM_ARG_N(args) _PD_CUSTOM_ARG_N_EXPAND args +#define _PD_CUSTOM_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +#define PD_DATALAYOUT(arg__) PD_DataLayout::arg__ + +#ifdef __COUNTER__ +#define PD_CUSTOM_PHI_KERNEL_ID __COUNTER__ +#else +#define PD_CUSTOM_PHI_KERNEL_ID __LINE__ +#endif + +#define PD_CUSTOM_PHI_KERNEL_CONCATENATE(arg1, arg2) \ + PD_CUSTOM_PHI_KERNEL_CONCATENATE1(arg1, arg2) +#define PD_CUSTOM_PHI_KERNEL_CONCATENATE1(arg1, arg2) \ + PD_CUSTOM_PHI_KERNEL_CONCATENATE2(arg1, arg2) +#define PD_CUSTOM_PHI_KERNEL_CONCATENATE2(arg1, arg2) arg1##arg2 +#define PD_CUSTOM_PHI_KERNEL_EXPAND(x) x + +#define _PD_BUILD_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, ...) \ + PD_CUSTOM_PHI_KERNEL_CONCATENATE(_PD_BUILD_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, backend, __VA_ARGS__) + +#define _PD_BUILD_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, cpp_dtype) \ + template decltype(meta_kernel_fn) meta_kernel_fn +#define _PD_BUILD_KERNEL_INSTANTIATION_2( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_3( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_4( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_5( \ + meta_kernel_fn, backend, cpp_dtype, ...) 
\ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_6( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_7( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_8( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_9( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_10( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_11( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_12( \ + meta_kernel_fn, backend, cpp_dtype, ...) 
\ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_13( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_14( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_15( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_1(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + int TouchCustomKernelSymbolFor_##kernel_name##_##backend##_##layout() { \ + return 0; \ + } + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_2(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_1(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_3(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_2(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_4(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_3(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_5(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_4(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_6(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_5(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_7(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_6(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_8(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_7(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_9(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_8(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_10(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_9(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_11(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_10(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_12(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_11(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_13(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_12(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_14(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_13(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_15(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_14(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT( \ + N, registrar_class, kernel_name, backend, layout, meta_kernel_fn, ...) 
\ + PD_CUSTOM_PHI_KERNEL_EXPAND(PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_, N)(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define PD_BUILD_KERNEL_REGISTRAR_INIT( \ + registrar_class, kernel_name, backend, layout, meta_kernel_fn, ...) \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT(PD_CUSTOM_NARGS(__VA_ARGS__), \ + registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define PD_BUILD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, ...) \ + _PD_BUILD_KERNEL_INSTANTIATION( \ + PD_CUSTOM_NARGS(__VA_ARGS__), meta_kernel_fn, backend, __VA_ARGS__) + +#define _PD_BUILD_2TA_KERNEL( \ + registrar_class, kernel_name, backend, layout, meta_kernel_fn, ...) \ + PD_BUILD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, __VA_ARGS__); \ + PD_BUILD_KERNEL_REGISTRAR_INIT(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__); + +#define _PD_BUILD_PHI_KERNEL( \ + registrar_class, kernel_name, backend, layout, meta_kernel_fn, ...) 
\ + PD_CUSTOM_PHI_KERNEL_EXPAND(_PD_BUILD_2TA_KERNEL(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(in_idx == 0, \ + "Kernel's DeviceContext should appear before Inputs."); \ + static_assert( \ + attr_idx == 0, \ + "Kernel's DeviceContext should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's DeviceContext should appear before Outputs."); \ + dev_ctx arg = PD_GetDeviceContext(ctx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_INPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + const tensor_type arg = PD_InputAt(ctx, in_idx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_OPTIONAL_INPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper &, \ + Tail...> { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + auto arg = PD_OptionalInputAt(ctx, in_idx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper &, \ + Tail...> { \ + template \ + 
static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + auto arg = PD_MultiInputAt(ctx, in_idx); \ + auto arg_wrapper = PD_GetPointerVector(&arg); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg_wrapper); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(attr_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(out_idx == 0, \ + "Kernel's Attributes should appear before Outputs."); \ + attr_type arg = PD_AttrAt(ctx, attr_idx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( \ + attr_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(out_idx == 0, \ + "Kernel's Attributes should appear before Outputs."); \ + attr_type arg = PD_AttrAt(ctx, attr_idx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_OUTPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + auto arg = PD_OutputAt(ctx, out_idx); \ + tensor_type *ptr = (arg.raw_data() ? 
&arg : nullptr); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., ptr); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_OUTPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper, Tail...> { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + auto arg = PD_MultiOutputAt(ctx, out_idx); \ + auto arg_wrapper = PD_GetPointerVector(&arg); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg_wrapper); \ + } \ + } + +template +struct CustomTypeTag {}; + +template +struct CustomKernelImpl; + +template +struct CustomKernelImpl { + static void Compute(PD_KernelContext *ctx) { + CustomKernelCallHelper>:: + template Compute<0, 0, 0, 0>(ctx); + } + + static void VariadicCompute(const phi::capi::DeviceContext &dev_ctx, + Args... args) { + return kernel_fn(static_cast(dev_ctx), std::forward(args)...); + } + + private: + template + struct CustomKernelCallHelper; + + /* DeviceContext Helpers */ + + PD_SPECIALIZE_CustomKernelCallHelper_FOR_DEVICE_CONTEXT( + phi::capi::DeviceContext); + + /* Input Helpers */ + + PD_SPECIALIZE_CustomKernelCallHelper_FOR_INPUT(phi::capi::DenseTensor); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_OPTIONAL_INPUT( + phi::capi::DenseTensor); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_INPUT(phi::capi::DenseTensor); + + /* Attribute Helpers */ + + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(bool); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(int32_t); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(int64_t); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(float); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(double); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(PD_DataType); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(PD_DataLayout); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(phi::capi::Place); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + 
PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF(std::string); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + phi::capi::Scalar); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + phi::capi::IntArray); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + + /* Output Helpers */ + + PD_SPECIALIZE_CustomKernelCallHelper_FOR_OUTPUT(phi::capi::DenseTensor); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_OUTPUT(phi::capi::DenseTensor); + + /* End case */ + template + struct CustomKernelCallHelper> { + template + static void Compute(PD_KernelContext *ctx, DevCtx dev_ctx, Args &...args) { + static_assert(dev_ctx_idx > 0, + "Kernel should pass DeviceContext as argument."); + static_assert(out_idx > 0, "Kernel should have output argument."); + return kernel_fn(dev_ctx, args...); + } + }; +}; + +template +struct CustomKernelArgsParseFunctor; + +template +struct CustomKernelArgsParseFunctor { + using Args = std::tuple; + enum : std::size_t { Arity = sizeof...(Args_) }; + using Indices = std::make_index_sequence; + template + using Arg = typename std::tuple_element::type; + + CustomKernelArgsParseFunctor() { + auto args_type = ParseArgType(Indices{}); + + for (auto arg_type : args_type) { + if (arg_type == + std::type_index(typeid(const phi::capi::DeviceContext *))) { + } else if (arg_type == + std::type_index(typeid(const phi::capi::DenseTensor &))) { + in_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_TENSOR); + } else if (arg_type == + std::type_index(typeid( + const paddle::optional &))) { + 
in_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_OPTIONAL_TENSOR); + } else if (arg_type == + std::type_index(typeid( + const std::vector &))) { + in_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_LIST_TENSOR); + } else if (arg_type == + std::type_index( + typeid(const paddle::optional< + std::vector> &))) { + in_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_OPTIONAL_MULTI_TENSOR); + } else if (arg_type == std::type_index(typeid(bool))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_BOOL); + } else if (arg_type == std::type_index(typeid(float))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_FLOAT32); + } else if (arg_type == std::type_index(typeid(double))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_FLOAT64); + } else if (arg_type == std::type_index(typeid(int32_t))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_INT32); + } else if (arg_type == std::type_index(typeid(int64_t))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_INT64); + } else if (arg_type == + std::type_index(typeid(const phi::capi::Place &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_PLACE); + } else if (arg_type == std::type_index(typeid(const std::string &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_STRING); + } else if (arg_type == + std::type_index(typeid(const std::vector &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_LIST_BOOL); + } else if (arg_type == + std::type_index(typeid(const std::vector &))) { + attr_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_LIST_FLOAT32); + } else if (arg_type == + std::type_index(typeid(const std::vector &))) { + attr_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_LIST_FLOAT64); + } else if (arg_type == + std::type_index(typeid(const std::vector &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_LIST_INT32); + } else if (arg_type == + 
std::type_index(typeid(const std::vector &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_LIST_INT64); + } else if (arg_type == + std::type_index(typeid(const std::vector &))) { + attr_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_LIST_STRING); + } else if (arg_type == std::type_index(typeid( + const std::vector &))) { + attr_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_LIST_SCALAR); + } else if (arg_type == + std::type_index(typeid(const phi::capi::Scalar &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_SCALAR); + } else if (arg_type == + std::type_index(typeid(const phi::capi::IntArray &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_INT_ARRAY); + } else if (arg_type == std::type_index(typeid(PD_DataType))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_DATA_TYPE); + } else if (arg_type == std::type_index(typeid(PD_DataLayout))) { + attr_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_DATA_LAYOUT); + } else if (arg_type == std::type_index(typeid(PD_DataLayout))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_PLACE); + } else if (arg_type == + std::type_index(typeid(phi::capi::DenseTensor *))) { + out_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_TENSOR); + } else if (arg_type == std::type_index(typeid( + std::vector))) { + out_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_LIST_TENSOR); + } + } + } + + std::vector in_args_type; + std::vector attr_args_type; + std::vector out_args_type; + + private: + template + static std::vector ParseArgType( + std::index_sequence) { + return {std::type_index(typeid(Arg))...}; + } +}; + +} // namespace capi +} // namespace phi + +#endif diff --git a/paddle/phi/capi/include/type_utils.h b/paddle/phi/capi/include/type_utils.h new file mode 100644 index 0000000000000..ed892c881d715 --- /dev/null +++ b/paddle/phi/capi/include/type_utils.h @@ -0,0 +1,123 @@ +// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace capi { + +inline PD_DataType ToPDDataType(::paddle::experimental::DataType dtype) { +#define return_result(in, ret) \ + case ::paddle::experimental::DataType::in: \ + return PD_DataType::ret + switch (dtype) { + return_result(UNDEFINED, UNDEFINED); + return_result(FLOAT64, FLOAT64); + return_result(FLOAT32, FLOAT32); + return_result(FLOAT16, FLOAT16); + return_result(BFLOAT16, BFLOAT16); + return_result(INT64, INT64); + return_result(INT32, INT32); + return_result(INT16, INT16); + return_result(INT8, INT8); + return_result(UINT64, UINT64); + return_result(UINT32, UINT32); + return_result(UINT16, UINT16); + return_result(UINT8, UINT8); + return_result(BOOL, BOOL); + default: { + PADDLE_THROW( + ::phi::errors::Unavailable("DataType %d is not supported.", dtype)); + } + } +#undef return_result +} + +inline ::paddle::experimental::DataType ToPhiDataType(PD_DataType dtype) { +#define return_result(in, ret) \ + case PD_DataType::in: \ + return ::paddle::experimental::DataType::ret + switch (dtype) { + return_result(UNDEFINED, UNDEFINED); + return_result(FLOAT64, FLOAT64); + return_result(FLOAT32, FLOAT32); + 
return_result(FLOAT16, FLOAT16); + return_result(BFLOAT16, BFLOAT16); + return_result(INT64, INT64); + return_result(INT32, INT32); + return_result(INT16, INT16); + return_result(INT8, INT8); + return_result(UINT64, UINT64); + return_result(UINT32, UINT32); + return_result(UINT16, UINT16); + return_result(UINT8, UINT8); + return_result(BOOL, BOOL); + default: { + PADDLE_THROW( + ::phi::errors::Unavailable("DataType %d is not supported.", dtype)); + return ::paddle::experimental::DataType::UNDEFINED; + } + } +#undef return_result +} + +inline PD_DataLayout ToPDDataLayout(::paddle::experimental::DataLayout layout) { +#define return_result(in, ret) \ + case ::paddle::experimental::DataLayout::in: \ + return PD_DataLayout::ret + switch (layout) { + return_result(ANY, ANY); + return_result(NHWC, NHWC); + return_result(NCHW, NCHW); + return_result(NCDHW, NCDHW); + return_result(NDHWC, NDHWC); + default: { + PADDLE_THROW(::phi::errors::Unavailable("DataLayout %d is not supported.", + layout)); + return PD_DataLayout::ANY; + } + } +#undef return_result +} + +inline ::paddle::experimental::DataLayout ToPhiDataLayout( + PD_DataLayout layout) { +#define return_result(in, ret) \ + case PD_DataLayout::in: \ + return ::paddle::experimental::DataLayout::ret + switch (layout) { + return_result(ANY, ANY); + return_result(NHWC, NHWC); + return_result(NCHW, NCHW); + return_result(NCDHW, NCDHW); + return_result(NDHWC, NDHWC); + default: { + PADDLE_THROW(::phi::errors::Unavailable("DataLayout %d is not supported.", + layout)); + return ::paddle::experimental::DataLayout::ANY; + } + } +#undef return_result +} + +} // namespace capi +} // namespace phi + +#endif diff --git a/paddle/phi/capi/include/wrapper_base.h b/paddle/phi/capi/include/wrapper_base.h new file mode 100644 index 0000000000000..2b5421bc266cf --- /dev/null +++ b/paddle/phi/capi/include/wrapper_base.h @@ -0,0 +1,497 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include +#include +#include +#include +#include +#include + +#include "paddle/phi/api/ext/exception.h" +#include "paddle/phi/capi/include/c_device_context.h" +#include "paddle/phi/capi/include/c_int_array.h" +#include "paddle/phi/capi/include/c_kernel_context.h" +#include "paddle/phi/capi/include/c_kernel_factory.h" +#include "paddle/phi/capi/include/c_kernel_registry.h" +#include "paddle/phi/capi/include/c_place.h" +#include "paddle/phi/capi/include/c_scalar.h" +#include "paddle/phi/capi/include/c_tensor.h" +#include "paddle/phi/capi/include/data_type.h" +#include "paddle/utils/optional.h" + +#define PD_CHECK_STATUS(status) PD_CHECK(status == C_SUCCESS) + +namespace phi { + +namespace capi { + +using LoD = std::vector>; + +template +static inline PD_List PDListFromVector(std::vector* vec) { + PD_List list; + list.data = reinterpret_cast(vec->data()); + list.size = vec->size(); + return list; +} + +template +static inline std::vector PDListToVector(PD_List list) { + return std::vector(static_cast(list.data), + static_cast(list.data) + list.size); +} + +inline std::vector PD_TensorGetDims(PD_Tensor* tensor, + PD_Status* status) { + int64_t ndims = PD_TensorGetNumDims(tensor, status); + if (ndims > 0) { + std::vector shape(ndims); + for (int64_t i = 0; i < ndims; ++i) { + shape[i] = PD_TensorGetDim(tensor, i, status); + } + return shape; + } + 
return std::vector(); +} + +template +class WrapperBase { + public: + explicit WrapperBase(T* ptr, bool own = false) : data_(ptr), own_(own) {} + + inline T* raw_data() const { return data_; } + + inline bool own_data() const { return own_; } + + inline void reset(const T* ptr) { data_ = ptr; } + + private: + T* data_; + bool own_; +}; + +class DenseTensor : public WrapperBase { + public: + DenseTensor() : WrapperBase(PD_NewTensor(), true) {} + + explicit DenseTensor(PD_Tensor* tensor) : WrapperBase(tensor) {} + + ~DenseTensor() { + if (own_data()) { + PD_DeleteTensor(raw_data()); + } + } + + bool valid() const { + C_Status status; + auto ret = PD_TensorIsValid(raw_data(), &status); + PD_CHECK_STATUS(status); + return ret; + } + + bool initialized() const { + C_Status status; + auto ret = PD_TensorIsInitialized(raw_data(), &status); + PD_CHECK_STATUS(status); + return ret; + } + + void* Holder() const { + C_Status status; + auto holder = PD_TensorGetHolder(raw_data(), &status); + PD_CHECK_STATUS(status); + return holder; + } + + std::vector dims() const { + C_Status status; + auto dimension = PD_TensorGetDims(raw_data(), &status); + PD_CHECK_STATUS(status); + return dimension; + } + + PD_DataType dtype() const { + C_Status status; + auto data_type = PD_TensorGetDataType(raw_data(), &status); + PD_CHECK_STATUS(status); + return data_type; + } + + PD_DataLayout layout() const { + C_Status status; + auto data_layout = PD_TensorGetDataLayout(raw_data(), &status); + PD_CHECK_STATUS(status); + return data_layout; + } + + int64_t numel() const { + C_Status status; + auto element_count = PD_TensorGetElementCount(raw_data(), &status); + PD_CHECK_STATUS(status); + return element_count; + } + + int64_t memory_size() const { + C_Status status; + auto byte_size = PD_TensorGetByteSize(raw_data(), &status); + PD_CHECK_STATUS(status); + return byte_size; + } + + LoD lod() const { + PD_List data, offset; + C_Status status; + PD_TensorGetLoD(raw_data(), &data, &offset, &status); + 
PD_CHECK_STATUS(status); + LoD lod_; + auto ptr = static_cast(data.data); + auto offset_ptr = static_cast(offset.data); + for (size_t i = 0; i < offset.size - 1; ++i) { + lod_.emplace_back(ptr + offset_ptr[i], ptr + offset_ptr[i + 1]); + } + delete[] ptr; + delete[] offset_ptr; + return lod_; + } + + void ResetLoD(const LoD& lod) { + std::vector data, offset; + offset.push_back(0); + for (const auto& item : lod) { + data.insert(data.cend(), item.cbegin(), item.cend()); + offset.push_back(item.size()); + } + PD_List data_list, offset_list; + data_list = PDListFromVector(&data); + offset_list = PDListFromVector(&offset); + + C_Status status; + PD_TensorResetLoD(raw_data(), data_list, offset_list, &status); + PD_CHECK_STATUS(status); + } + + void Resize(const std::vector& dims) { + C_Status status; + PD_TensorSetDims(raw_data(), dims.size(), dims.data(), &status); + PD_CHECK_STATUS(status); + } + + void set_dtype(PD_DataType data_type) { + C_Status status; + PD_TensorSetDataType(raw_data(), data_type, &status); + PD_CHECK_STATUS(status); + } + + void set_layout(PD_DataLayout data_layout) { + C_Status status; + PD_TensorSetDataLayout(raw_data(), data_layout, &status); + PD_CHECK_STATUS(status); + } + + template + T* data() const { + C_Status status; + auto ptr = PD_TensorGetDataPointer(raw_data(), &status); + PD_CHECK_STATUS(status); + return static_cast(ptr); + } + + // template + // T* mutable_data(int64_t size = 0, const PD_DeviceContext* ctx = nullptr) { + // C_Status status; + // auto ptr = PD_DeviceContextAllocateTensor( + // ctx, raw_data(), size, phi::capi::CppTypeToPDType::Type(), + // &status); + // PD_CHECK_STATUS(status); + // return static_cast(ptr); + // } + + // void* mutable_data(PD_DataType data_type, + // int64_t size = 0, + // const PD_DeviceContext* ctx = nullptr) { + // C_Status status; + // auto ptr = PD_DeviceContextAllocateTensor( + // ctx, raw_data(), size, data_type, &status); + // PD_CHECK_STATUS(status); + // return static_cast(ptr); + // } 
+ + DenseTensor& ShareDataWith(const DenseTensor& src) { + C_Status status; + PD_TensorShareDataWith(raw_data(), src.raw_data(), &status); + PD_CHECK_STATUS(status); + return *this; + } + + void share_lod(const DenseTensor& src) { + C_Status status; + PD_TensorShareLoDWith(raw_data(), src.raw_data(), &status); + PD_CHECK_STATUS(status); + } +}; + +class DeviceContext : public WrapperBase { + public: + explicit DeviceContext(PD_DeviceContext* context) + : WrapperBase(context) {} + + void* stream() const { + C_Status status; + auto stream_ = PD_DeviceContextGetStream(raw_data(), &status); + PD_CHECK_STATUS(status); + return stream_; + } + + void* Alloc(DenseTensor* tensor, + PD_DataType dtype, + int64_t requested_size = 0) const { + C_Status status; + auto ptr = PD_DeviceContextAllocateTensor( + raw_data(), tensor->raw_data(), requested_size, dtype, &status); + PD_CHECK_STATUS(status); + return static_cast(ptr); + } + + template + T* Alloc(DenseTensor* tensor, int64_t requested_size = 0) const { + C_Status status; + auto ptr = + PD_DeviceContextAllocateTensor(raw_data(), + tensor->raw_data(), + requested_size, + phi::capi::CppTypeToPDType::Type(), + &status); + PD_CHECK_STATUS(status); + return static_cast(ptr); + } + + void* HostAlloc(DenseTensor* tensor, + PD_DataType dtype, + int64_t requested_size = 0) const { + C_Status status; + auto ptr = PD_DeviceContextAllocateTensor( + nullptr, tensor->raw_data(), requested_size, dtype, &status); + PD_CHECK_STATUS(status); + return static_cast(ptr); + } + + template + T* HostAlloc(DenseTensor* tensor, int64_t requested_size = 0) const { + C_Status status; + auto ptr = + PD_DeviceContextAllocateTensor(nullptr, + tensor->raw_data(), + requested_size, + phi::capi::CppTypeToPDType::Type(), + &status); + PD_CHECK_STATUS(status); + return static_cast(ptr); + } +}; + +class Scalar : public WrapperBase { + public: + explicit Scalar(PD_Scalar* scalar) : WrapperBase(scalar) {} + + PD_DataType dtype() const { return 
PD_ScalarGetDataType(raw_data()); } + + template + inline T to() const; +}; + +template <> +inline bool Scalar::to() const { + return PD_ScalarGetBoolData(raw_data()); +} + +template <> +inline float Scalar::to() const { + return PD_ScalarGetFloat32Data(raw_data()); +} + +template <> +inline double Scalar::to() const { + return PD_ScalarGetFloat64Data(raw_data()); +} + +template <> +inline uint8_t Scalar::to() const { + return PD_ScalarGetUInt8Data(raw_data()); +} + +template <> +inline uint16_t Scalar::to() const { + return PD_ScalarGetUInt16Data(raw_data()); +} + +template <> +inline uint32_t Scalar::to() const { + return PD_ScalarGetUInt32Data(raw_data()); +} + +template <> +inline uint64_t Scalar::to() const { + return PD_ScalarGetUInt64Data(raw_data()); +} + +template <> +inline int8_t Scalar::to() const { + return PD_ScalarGetInt8Data(raw_data()); +} + +template <> +inline int16_t Scalar::to() const { + return PD_ScalarGetInt16Data(raw_data()); +} + +template <> +inline int32_t Scalar::to() const { + return PD_ScalarGetInt32Data(raw_data()); +} + +template <> +inline int64_t Scalar::to() const { + return PD_ScalarGetInt64Data(raw_data()); +} + +class IntArray : WrapperBase { + public: + explicit IntArray(PD_IntArray* int_array) + : WrapperBase(int_array) {} + + size_t size() const { return PD_IntArrayGetElementCount(raw_data()); } + + std::vector GetData() const { + auto list = PD_IntArrayGetDataPointer(raw_data()); + auto data = reinterpret_cast(list.data); + std::vector ret(data, data + list.size); + return ret; + } +}; + +class Place : WrapperBase { + public: + explicit Place(PD_Place* place) : WrapperBase(place) {} + + bool is_host() { return PD_PlaceIsHost(raw_data()); } + + int8_t GetDeviceID() { return PD_PlaceGetDeviceId(raw_data()); } +}; + +class TensorArgDef : WrapperBase { + public: + explicit TensorArgDef(PD_TensorArgDef* tensor_arg_def) + : WrapperBase(tensor_arg_def) {} + + // TensorArgDef& SetBackend() { + // return *this; + // } + + 
TensorArgDef& SetDataLayout(PD_DataLayout in_layout) { + C_Status status; + PD_TensorArgDefSetDataLayout(raw_data(), in_layout, &status); + PD_CHECK_STATUS(status); + return *this; + } + + TensorArgDef& SetDataType(PD_DataType in_dtype) { + C_Status status; + PD_TensorArgDefSetDataType(raw_data(), in_dtype, &status); + PD_CHECK_STATUS(status); + return *this; + } +}; + +class KernelArgsDef : WrapperBase { + public: + explicit KernelArgsDef(PD_KernelArgsDef* kernel_args_def) + : WrapperBase(kernel_args_def) {} + + std::vector input_defs() { + C_Status status; + auto list = PD_KernelArgsDefGetInputArgDefs(raw_data(), &status); + PD_CHECK_STATUS(status); + auto ptr = reinterpret_cast(list.data); + std::vector ret; + for (size_t i = 0; i < list.size; ++i) { + ret.emplace_back(ptr[i]); + } + PD_DeletePointerList(list); + return ret; + } + + std::vector output_defs() { + C_Status status; + auto list = PD_KernelArgsDefGetOutputArgDefs(raw_data(), &status); + PD_CHECK_STATUS(status); + auto ptr = reinterpret_cast(list.data); + std::vector ret; + for (size_t i = 0; i < list.size; ++i) { + ret.emplace_back(ptr[i]); + } + PD_DeletePointerList(list); + return ret; + } + + // std::vector + // attribute_defs() { + // } +}; + +class KernelKey : WrapperBase { + public: + explicit KernelKey(PD_KernelKey* kernel_key) + : WrapperBase(kernel_key) {} + + // Backend backend() const { return backend_; } + PD_DataLayout layout() const { + PD_Status status; + auto layout_ = PD_KernelKeyGetLayout(raw_data(), &status); + PD_CHECK_STATUS(status); + return layout_; + } + + PD_DataType dtype() const { + PD_Status status; + auto dtype_ = PD_KernelKeyGetDataType(raw_data(), &status); + PD_CHECK_STATUS(status); + return dtype_; + } +}; + +class Kernel : WrapperBase { + public: + explicit Kernel(PD_Kernel* kernel) : WrapperBase(kernel) {} + + KernelArgsDef args_def() const { + C_Status status; + auto ptr = PD_KernelGetArgsDef(raw_data(), &status); + PD_CHECK_STATUS(status); + return 
KernelArgsDef(ptr); + } + + TensorArgDef InputAt(size_t idx) { return args_def().input_defs()[idx]; } + + TensorArgDef OutputAt(size_t idx) { return args_def().input_defs()[idx]; } +}; + +} // namespace capi +} // namespace phi + +#endif diff --git a/paddle/phi/capi/lib/CMakeLists.txt b/paddle/phi/capi/lib/CMakeLists.txt new file mode 100644 index 0000000000000..de335bb668bdf --- /dev/null +++ b/paddle/phi/capi/lib/CMakeLists.txt @@ -0,0 +1,44 @@ +cc_library( + phi_c_data_type + SRCS c_data_type.cc + DEPS dense_tensor) + +cc_library( + phi_c_device_context + SRCS c_device_context.cc + DEPS phi_context) + +cc_library( + phi_c_int_array + SRCS c_int_array.cc + DEPS int_array) + +cc_library( + phi_c_kernel_context + SRCS c_kernel_context.cc + DEPS kernel_context) + +cc_library( + phi_c_kernel_factory + SRCS c_kernel_factory.cc + DEPS kernel_factory) + +cc_library( + phi_c_kernel_registry + SRCS c_kernel_registry.cc + DEPS dense_tensor) + +cc_library( + phi_c_place + SRCS c_place.cc + DEPS phi_place) + +cc_library( + phi_c_scalar + SRCS c_scalar.cc + DEPS scalar) + +cc_library( + phi_c_tensor + SRCS c_tensor.cc + DEPS dense_tensor) diff --git a/paddle/phi/capi/lib/c_data_type.cc b/paddle/phi/capi/lib/c_data_type.cc new file mode 100644 index 0000000000000..547df06338f0f --- /dev/null +++ b/paddle/phi/capi/lib/c_data_type.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/capi/include/c_data_type.h" + +#include "paddle/phi/capi/include/common.h" + +void PD_DeletePointerList(PD_List list) { + auto data = reinterpret_cast(list.data); + if (data) delete[] data; +} + +void PD_DeleteUInt8List(PD_List list) { + auto data = reinterpret_cast(list.data); + if (data) delete[] data; +} + +void PD_DeleteInt64List(PD_List list) { + auto data = reinterpret_cast(list.data); + if (data) delete[] data; +} + +void PD_DeleteInt32List(PD_List list) { + auto data = reinterpret_cast(list.data); + delete[] data; +} + +void PD_DeleteFloat64List(PD_List list) { + auto data = reinterpret_cast(list.data); + if (data) delete[] data; +} + +void PD_DeleteFloat32List(PD_List list) { + auto data = reinterpret_cast(list.data); + if (data) delete[] data; +} + +PD_REGISTER_CAPI(data_type); diff --git a/paddle/phi/capi/lib/c_device_context.cc b/paddle/phi/capi/lib/c_device_context.cc new file mode 100644 index 0000000000000..96b46fbc0d4ff --- /dev/null +++ b/paddle/phi/capi/lib/c_device_context.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/capi/include/c_device_context.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/capi/include/type_utils.h" +#include "paddle/phi/core/dense_tensor.h" + +PD_Stream PD_DeviceContextGetStream(const PD_DeviceContext* ctx, + PD_Status* status) { + if (status) { + if (!ctx) { + *status = C_FAILED; + return nullptr; + } + *status = C_SUCCESS; + } + auto dev_ctx_type = + reinterpret_cast(ctx)->GetPlace().GetType(); + if (dev_ctx_type == phi::AllocationType::CUSTOM) { + return reinterpret_cast( + reinterpret_cast(ctx)->stream()); + } else if (dev_ctx_type == phi::AllocationType::CPU) { + return nullptr; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + } else if (dev_ctx_type == phi::AllocationType::GPU) { + return reinterpret_cast( + reinterpret_cast(ctx)->stream()); +#endif +#ifdef PADDLE_WITH_XPU + } else if (dev_ctx_type == phi::AllocationType::XPU) { + return nullptr; +#endif + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Only support Custom/CPU/GPU/XPU DeviceContext")); + } +} + +void* PD_DeviceContextAllocateTensor(const PD_DeviceContext* ctx, + PD_Tensor* tensor, + size_t size, + PD_DataType dtype, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return nullptr; + } + *status = C_SUCCESS; + } + + auto dev_ctx = reinterpret_cast(ctx); + auto cc_tensor = reinterpret_cast(tensor); + auto phi_dtype = phi::capi::ToPhiDataType(dtype); + if (ctx) { + return dev_ctx->Alloc(cc_tensor, phi_dtype, size); + } else { + auto place = phi::CPUPlace(); + return cc_tensor->mutable_data(place, phi_dtype, size); + } +} + +PD_REGISTER_CAPI(device_context); diff --git a/paddle/phi/capi/lib/c_int_array.cc b/paddle/phi/capi/lib/c_int_array.cc new file mode 100644 index 0000000000000..7562700372c3b --- /dev/null +++ b/paddle/phi/capi/lib/c_int_array.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/capi/include/c_int_array.h" + +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/common/int_array.h" + +PD_List PD_IntArrayGetDataPointer(PD_IntArray* int_array) { + auto cc_int_array = reinterpret_cast(int_array); + const auto& data = cc_int_array->GetData(); + PD_List list; + list.size = data.size(); + list.data = const_cast(data.data()); + return list; +} + +size_t PD_IntArrayGetSize(PD_IntArray* int_array) { + auto cc_int_array = reinterpret_cast(int_array); + return cc_int_array->size(); +} + +PD_REGISTER_CAPI(int_array); diff --git a/paddle/phi/capi/lib/c_kernel_context.cc b/paddle/phi/capi/lib/c_kernel_context.cc new file mode 100644 index 0000000000000..2e14b019c19ff --- /dev/null +++ b/paddle/phi/capi/lib/c_kernel_context.cc @@ -0,0 +1,223 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/capi/include/c_kernel_context.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/capi/include/type_utils.h" +#include "paddle/phi/core/kernel_context.h" + +PD_DeviceContext* PD_KernelContextGetDeviceContext(PD_KernelContext* ctx) { + auto kernel_context = reinterpret_cast(ctx); + auto dev_ctx_type = kernel_context->GetDeviceContext() + .GetPlace() + .GetType(); + if (dev_ctx_type == phi::AllocationType::CUSTOM) { + return reinterpret_cast(const_cast( + &kernel_context->GetDeviceContext())); + } else if (dev_ctx_type == phi::AllocationType::CPU) { + return reinterpret_cast(const_cast( + &kernel_context->GetDeviceContext())); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + } else if (dev_ctx_type == phi::AllocationType::GPU) { + return reinterpret_cast(const_cast( + &kernel_context->GetDeviceContext())); +#endif +#ifdef PADDLE_WITH_XPU + } else if (dev_ctx_type == phi::AllocationType::XPU) { + return reinterpret_cast(const_cast( + &kernel_context->GetDeviceContext())); +#endif + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Only support Custom/CPU/GPU/XPU DeviceContext")); + } +} + +PD_Tensor* PD_KernelContextInputAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + const std::pair& range = kernel_context->InputRangeAt(index); + return reinterpret_cast(const_cast( + &kernel_context->InputAt(range.first))); +} + +PD_List PD_KernelContextMultiInputAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + const std::pair& range = kernel_context->InputRangeAt(index); + auto tensor_vec = kernel_context->InputsBetween( + range.first, range.second); + PD_List list; + list.size = tensor_vec.size(); + list.data = tensor_vec.data(); + return list; +} + +PD_Tensor* PD_KernelContextOutputAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + const std::pair& range 
= kernel_context->OutputRangeAt(index); + return reinterpret_cast( + kernel_context->MutableOutputAt(range.first)); +} + +PD_List PD_KernelContextMultiOutputAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + const std::pair& range = kernel_context->OutputRangeAt(index); + auto tensor_vec = kernel_context->MutableOutputBetween( + range.first, range.second); + PD_List list; + list.size = tensor_vec.size(); + list.data = tensor_vec.data(); + return list; +} + +bool PD_KernelContextBoolAttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return kernel_context->AttrAt(index); +} + +int32_t PD_KernelContextInt32AttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return kernel_context->AttrAt(index); +} + +int64_t PD_KernelContextInt64AttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return kernel_context->AttrAt(index); +} + +float PD_KernelContextFloatAttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return kernel_context->AttrAt(index); +} + +double PD_KernelContextDoubleAttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return kernel_context->AttrAt(index); +} + +PD_Scalar* PD_KernelContextScalarAttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return reinterpret_cast( + const_cast(&kernel_context->AttrAt(index))); +} + +PD_IntArray* PD_KernelContextIntArrayAttrAt(PD_KernelContext* ctx, + size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return reinterpret_cast(const_cast( + &kernel_context->AttrAt(index))); +} + +PD_List PD_KernelContextListBoolAttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + auto data = 
reinterpret_cast(new uint8_t[cc_list.size()]); + for (size_t i = 0; i < cc_list.size(); ++i) { + data[i] = static_cast(cc_list[i]); + } + list.data = data; + return list; +} + +PD_List PD_KernelContextListInt32AttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_KernelContextListInt64AttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_KernelContextListFloatAttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_KernelContextListDoubleAttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +char* PD_KernelContextStringAttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return const_cast(kernel_context->AttrAt(index).data()); +} + +PD_List PD_KernelContextListStringAttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + auto data = new char*[list.size]; + for (size_t i = 0; i < list.size; ++i) { + data[i] = const_cast(cc_list[i].data()); + } + list.data = reinterpret_cast(data); + return list; +} + +PD_List 
PD_KernelContextListScalarAttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + auto data = new PD_Scalar*[list.size]; + for (size_t i = 0; i < list.size; ++i) { + data[i] = + const_cast(reinterpret_cast(&cc_list[i])); + } + list.data = data; + return list; +} + +PD_Place* PD_KernelContextPlaceAttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return reinterpret_cast( + const_cast(&kernel_context->AttrAt(index))); +} + +PD_DataType PD_KernelContextDataTypeAttrAt(PD_KernelContext* ctx, + size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return phi::capi::ToPDDataType(kernel_context->AttrAt(index)); +} + +PD_DataLayout PD_KernelContextDataLayoutAttrAt(PD_KernelContext* ctx, + size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return phi::capi::ToPDDataLayout( + kernel_context->AttrAt(index)); +} + +PD_REGISTER_CAPI(kernel_context); diff --git a/paddle/phi/capi/lib/c_kernel_factory.cc b/paddle/phi/capi/lib/c_kernel_factory.cc new file mode 100644 index 0000000000000..8bf94467b472a --- /dev/null +++ b/paddle/phi/capi/lib/c_kernel_factory.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/capi/include/c_kernel_factory.h" + +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/capi/include/type_utils.h" +#include "paddle/phi/core/kernel_factory.h" + +/** + * TensorArgDef + */ + +void PD_TensorArgDefSetDataLayout(PD_TensorArgDef* def, + PD_DataLayout layout, + PD_Status* status) { + if (status) { + if (!def) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + auto cc_def = reinterpret_cast(def); + cc_def->SetDataLayout(phi::capi::ToPhiDataLayout(layout)); +} + +void PD_TensorArgDefSetDataType(PD_TensorArgDef* def, + PD_DataType dtype, + PD_Status* status) { + if (status) { + if (!def) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + auto cc_def = reinterpret_cast(def); + cc_def->SetDataType(phi::capi::ToPhiDataType(dtype)); +} + +/** + * KernelArgsDef + */ + +PD_List PD_KernelArgsDefGetInputArgDefs(PD_KernelArgsDef* def, + PD_Status* status) { + PD_List list; + if (status) { + if (!def) { + *status = C_FAILED; + list.size = 0; + list.data = nullptr; + return list; + } + *status = C_SUCCESS; + } + auto cc_def = reinterpret_cast(def); + auto& arg_defs = cc_def->input_defs(); + list.size = arg_defs.size(); + auto ptr = new PD_TensorArgDef*[list.size]; + list.data = ptr; + for (size_t i = 0; i < list.size; ++i) { + ptr[i] = reinterpret_cast(&arg_defs[i]); + } + return list; +} + +PD_List PD_KernelArgsDefGetOutputArgDefs(PD_KernelArgsDef* def, + PD_Status* status) { + PD_List list; + if (status) { + if (!def) { + *status = C_FAILED; + list.size = 0; + list.data = nullptr; + return list; + } + *status = C_SUCCESS; + } + auto cc_def = reinterpret_cast(def); + auto& arg_defs = cc_def->output_defs(); + list.size = arg_defs.size(); + auto ptr = new PD_TensorArgDef*[list.size]; + list.data = ptr; + for (size_t i = 0; i < list.size; ++i) { + ptr[i] = reinterpret_cast(&arg_defs[i]); + } + return list; +} + +/** + * KernelKey + */ + +PD_DataLayout PD_KernelKeyGetLayout(PD_KernelKey* key, 
PD_Status* status) { + if (status) { + if (!key) { + *status = C_FAILED; + return PD_DataLayout::ALL_LAYOUT; + } + *status = C_SUCCESS; + } + auto cc_key = reinterpret_cast(key); + return phi::capi::ToPDDataLayout(cc_key->layout()); +} + +PD_DataType PD_KernelKeyGetDataType(PD_KernelKey* key, PD_Status* status) { + if (status) { + if (!key) { + *status = C_FAILED; + return PD_DataType::UNDEFINED; + } + *status = C_SUCCESS; + } + auto cc_key = reinterpret_cast(key); + return phi::capi::ToPDDataType(cc_key->dtype()); +} + +/** + * Kernel + */ + +PD_KernelArgsDef* PD_KernelGetArgsDef(PD_Kernel* kernel, PD_Status* status) { + if (status) { + if (!kernel) { + *status = C_FAILED; + return nullptr; + } + *status = C_SUCCESS; + } + auto cc_kernel = reinterpret_cast(kernel); + return reinterpret_cast( + const_cast(&cc_kernel->args_def())); +} + +PD_REGISTER_CAPI(kernel_factory); diff --git a/paddle/phi/capi/lib/c_kernel_registry.cc b/paddle/phi/capi/lib/c_kernel_registry.cc new file mode 100644 index 0000000000000..6cf6208856bfa --- /dev/null +++ b/paddle/phi/capi/lib/c_kernel_registry.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/capi/include/c_kernel_registry.h" + +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/capi/include/type_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +void PD_KernelArgsParseFn(const phi::KernelKey& default_key, + phi::KernelArgsDef* args_def, + size_t in_nargs, + PD_KernelArgumentType* in_args_type, + size_t attr_nargs, + PD_KernelArgumentType* attr_args_type, + size_t out_nargs, + PD_KernelArgumentType* out_args_type) { + auto default_tensor_layout = phi::DataLayout::NCHW; + if (default_key.layout() != phi::DataLayout::ANY) { + default_tensor_layout = default_key.layout(); + } + // inputs + for (size_t i = 0; i < in_nargs; ++i) { + auto arg_type = in_args_type[i]; + if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_CONTEXT) { + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_TENSOR) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + std::type_index(typeid(const phi::DenseTensor&))); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_OPTIONAL_TENSOR) { + args_def->AppendInput( + default_key.backend(), + default_tensor_layout, + default_key.dtype(), + std::type_index(typeid(const paddle::optional&))); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_TENSOR) { + args_def->AppendInput( + default_key.backend(), + default_tensor_layout, + default_key.dtype(), + std::type_index(typeid(const std::vector&))); + } else if (arg_type == + PD_KernelArgumentType::PD_ARG_TYPE_OPTIONAL_MULTI_TENSOR) { + args_def->AppendInput( + default_key.backend(), + default_tensor_layout, + default_key.dtype(), + std::type_index(typeid( + const paddle::optional>&))); + } else { + PADDLE_THROW(phi::errors::Unavailable( + "PD_KernelArgumentType %d is not supported.", arg_type)); + } + } + // attributes + for (size_t i = 0; i < attr_nargs; ++i) { + auto arg_type = attr_args_type[i]; + if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_BOOL) { + 
args_def->AppendAttribute(phi::AttributeType::BOOL); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_FLOAT32) { + args_def->AppendAttribute(phi::AttributeType::FLOAT32); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_FLOAT64) { + args_def->AppendAttribute(phi::AttributeType::FLOAT64); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_INT32) { + args_def->AppendAttribute(phi::AttributeType::INT32); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_INT64) { + args_def->AppendAttribute(phi::AttributeType::INT64); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_STRING) { + args_def->AppendAttribute(phi::AttributeType::STRING); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_SCALAR) { + args_def->AppendAttribute(phi::AttributeType::SCALAR); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_INT_ARRAY) { + args_def->AppendAttribute(phi::AttributeType::INT_ARRAY); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_DATA_TYPE) { + args_def->AppendAttribute(phi::AttributeType::DATA_TYPE); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_DATA_LAYOUT) { + args_def->AppendAttribute(phi::AttributeType::DATA_LAYOUT); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_PLACE) { + args_def->AppendAttribute(phi::AttributeType::PLACE); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_BOOL) { + args_def->AppendAttribute(phi::AttributeType::BOOLS); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_INT32) { + args_def->AppendAttribute(phi::AttributeType::INT32S); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_INT64) { + args_def->AppendAttribute(phi::AttributeType::INT64S); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_FLOAT32) { + args_def->AppendAttribute(phi::AttributeType::FLOAT32S); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_FLOAT64) { + 
args_def->AppendAttribute(phi::AttributeType::FLOAT64S); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_STRING) { + args_def->AppendAttribute(phi::AttributeType::STRINGS); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_SCALAR) { + args_def->AppendAttribute(phi::AttributeType::SCALARS); + } else { + PADDLE_THROW(phi::errors::Unavailable( + "PD_KernelArgumentType %d is not supported.", arg_type)); + } + } + // outputs + for (size_t i = 0; i < out_nargs; ++i) { + auto arg_type = out_args_type[i]; + if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_TENSOR) { + args_def->AppendOutput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + std::type_index(typeid(phi::DenseTensor*))); + } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_TENSOR) { + args_def->AppendOutput( + default_key.backend(), + default_tensor_layout, + default_key.dtype(), + std::type_index(typeid(std::vector))); + } else { + PADDLE_THROW(phi::errors::Unavailable( + "PD_KernelArgumentType %d is not supported.", arg_type)); + } + } +} + +void PD_RegisterPhiKernel(const char* kernel_name_cstr, + const char* backend_cstr, + PD_DataType pd_dtype, + PD_DataLayout pd_layout, + size_t in_nargs, + PD_KernelArgumentType* in_args_type, + size_t attr_nargs, + PD_KernelArgumentType* attr_args_type, + size_t out_nargs, + PD_KernelArgumentType* out_args_type, + void (*args_def_fn)(const PD_KernelKey*, PD_Kernel*), + void (*fn)(PD_KernelContext*), + void* variadic_kernel_fn) { + auto args_def_fn_wrapper = [args_def_fn](const phi::KernelKey& kernel_key, + phi::Kernel* kernel) { + args_def_fn(reinterpret_cast(&kernel_key), + reinterpret_cast(kernel)); + }; + phi::KernelFn kernel_fn = [fn](phi::KernelContext* ctx) { + fn(reinterpret_cast(ctx)); + }; + std::string kernel_name(kernel_name_cstr); + + auto dtype = phi::capi::ToPhiDataType(pd_dtype); + auto layout = phi::capi::ToPhiDataLayout(pd_layout); + phi::KernelKey kernel_key( + 
paddle::experimental::StringToBackend(backend_cstr), layout, dtype); + + phi::Kernel kernel(kernel_fn, variadic_kernel_fn); + PD_KernelArgsParseFn(kernel_key, + kernel.mutable_args_def(), + in_nargs, + in_args_type, + attr_nargs, + attr_args_type, + out_nargs, + out_args_type); + + args_def_fn_wrapper(kernel_key, &kernel); + phi::KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; +} + +PD_REGISTER_CAPI(kernel_registry); diff --git a/paddle/phi/capi/lib/c_place.cc b/paddle/phi/capi/lib/c_place.cc new file mode 100644 index 0000000000000..cccccbbb259f3 --- /dev/null +++ b/paddle/phi/capi/lib/c_place.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/capi/include/c_place.h" + +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/common/place.h" + +bool PD_PlaceIsHost(PD_Place* place) { + auto cc_place = reinterpret_cast(place); + return cc_place->GetType() == phi::AllocationType::CPU; +} + +int8_t PD_PlaceGetDeviceId(PD_Place* place) { + auto cc_place = reinterpret_cast(place); + return cc_place->GetDeviceId(); +} + +PD_REGISTER_CAPI(place); diff --git a/paddle/phi/capi/lib/c_scalar.cc b/paddle/phi/capi/lib/c_scalar.cc new file mode 100644 index 0000000000000..655465c8f848f --- /dev/null +++ b/paddle/phi/capi/lib/c_scalar.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/capi/include/c_scalar.h" + +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/capi/include/type_utils.h" +#include "paddle/phi/common/scalar.h" + +PD_DataType PD_ScalarGetType(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return phi::capi::ToPDDataType(cc_scalar->dtype()); +} + +bool PD_ScalarGetBoolData(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +int8_t PD_ScalarGetInt8Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +int16_t PD_ScalarGetInt16Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +int32_t PD_ScalarGetInt32Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +int64_t PD_ScalarGetInt64Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +uint8_t PD_ScalarGetUInt8Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +uint16_t PD_ScalarGetUInt16Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +uint32_t PD_ScalarGetUInt32Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +uint64_t PD_ScalarGetUInt64Data(PD_Scalar* scalar) { + auto cc_scalar 
= reinterpret_cast(scalar); + return cc_scalar->to(); +} + +float PD_ScalarGetFloat32Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +double PD_ScalarGetFloat64Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +PD_REGISTER_CAPI(scalar); diff --git a/paddle/phi/capi/lib/c_tensor.cc b/paddle/phi/capi/lib/c_tensor.cc new file mode 100644 index 0000000000000..cd0bbd62d88a0 --- /dev/null +++ b/paddle/phi/capi/lib/c_tensor.cc @@ -0,0 +1,302 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/capi/include/c_tensor.h" + +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/capi/include/type_utils.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" + +PD_DataType PD_TensorGetDataType(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return PD_DataType::UNDEFINED; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + return phi::capi::ToPDDataType(cc_tensor->dtype()); +} + +PD_DataLayout PD_TensorGetDataLayout(const PD_Tensor* tensor, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return PD_DataLayout::ALL_LAYOUT; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + return phi::capi::ToPDDataLayout(cc_tensor->layout()); +} + +int64_t PD_TensorGetByteSize(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->memory_size(); +} + +void* PD_TensorGetDataPointer(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return nullptr; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + return const_cast(cc_tensor->data()); +} + +int64_t PD_TensorGetElementCount(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->numel(); +} + +int64_t PD_TensorGetNumDims(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->dims().size(); +} + +int64_t PD_TensorGetDim(const PD_Tensor* tensor, + size_t index, + PD_Status* status) { + auto 
cc_tensor = reinterpret_cast(tensor); + + if (status) { + if (!tensor || index >= static_cast(cc_tensor->dims().size())) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + return cc_tensor->dims()[index]; +} + +void PD_TensorGetLoD(const PD_Tensor* tensor, + PD_List* data, + PD_List* offset, + PD_Status* status) { + auto cc_tensor = reinterpret_cast(tensor); + + if (status) { + if (!tensor || !data || !offset) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + auto lod = cc_tensor->lod(); + offset->size = lod.size() + 1; + auto offset_data = new size_t[offset->size]; + offset->data = offset_data; + offset_data[0] = 0; + + size_t sz = 0; + for (size_t i = 0; i < lod.size(); ++i) { + offset_data[i + 1] = lod[i].size() + offset_data[i]; + sz += lod[i].size(); + } + + auto data_ptr = new size_t[sz]; + data->data = data_ptr; + data->size = sz; + for (size_t i = 0; i < lod.size(); ++i) { + memcpy(data_ptr, lod[i].data(), lod[i].size() * sizeof(size_t)); + data_ptr += lod[i].size(); + } +} + +bool PD_TensorIsInitialized(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return false; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->initialized(); +} + +bool PD_TensorIsValid(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return false; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->valid(); +} + +void* PD_TensorGetHolder(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return nullptr; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->Holder().get(); +} + +void PD_TensorSetDims(PD_Tensor* tensor, + int64_t ndims, + const int64_t* dims, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + 
*status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + std::vector shape(dims, dims + ndims); + cc_tensor->Resize(phi::make_ddim(shape)); +} + +void PD_TensorSetDataType(PD_Tensor* tensor, + PD_DataType dtype, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + cc_tensor->set_type(phi::capi::ToPhiDataType(dtype)); +} + +void PD_TensorSetDataLayout(PD_Tensor* tensor, + PD_DataLayout layout, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + cc_tensor->set_layout(phi::capi::ToPhiDataLayout(layout)); +} + +void PD_TensorResetLoD(PD_Tensor* tensor, + PD_List data, + PD_List offset, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + phi::LoD lod; + auto offset_ptr = static_cast(offset.data); + auto data_ptr = static_cast(data.data); + + for (size_t i = 0; i < offset.size - 1; ++i) { + lod.emplace_back(data_ptr + offset_ptr[i], data_ptr + offset_ptr[i + 1]); + } + auto cc_tensor = reinterpret_cast(tensor); + cc_tensor->ResetLoD(lod); +} + +PD_Tensor* PD_NewTensor() { + return reinterpret_cast(new phi::DenseTensor()); +} + +void PD_DeleteTensor(PD_Tensor* tensor) { + auto cc_tensor = reinterpret_cast(tensor); + delete cc_tensor; +} + +void PD_TensorShareDataWith(PD_Tensor* dst, + const PD_Tensor* src, + PD_Status* status) { + if (status) { + if (!dst || !src) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + auto cc_dst_tensor = reinterpret_cast(dst); + auto cc_src_tensor = reinterpret_cast(src); + cc_dst_tensor->ShareDataWith(*cc_src_tensor); +} + +void PD_TensorShareLoDWith(PD_Tensor* dst, + const PD_Tensor* src, + PD_Status* status) { + if (status) { + if (!dst || !src) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } 
+ + auto cc_dst_tensor = reinterpret_cast(dst); + auto cc_src_tensor = const_cast( + reinterpret_cast(src)); + + phi::MetaTensor meta_dst(cc_dst_tensor); + const phi::MetaTensor meta_src(cc_src_tensor); + meta_dst.share_lod(meta_src); +} + +PD_REGISTER_CAPI(tensor); diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 25f222546656f..033c50e537da6 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -277,6 +277,12 @@ void set_constant(const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { TensorSetConstantWithPlace func(context, tensor, value); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (paddle::platform::is_custom_place(context.GetPlace())) { + func(phi::CPUPlace()); + return; + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // tensor->place().apply_visitor(func); paddle::platform::VisitPlace(tensor->place(), func); diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c.cc b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c.cc new file mode 100644 index 0000000000000..f0ea48ed93595 --- /dev/null +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/custom_phi_kernel.h" + +namespace paddle { + +namespace custom_kernel { + +// Here we use dot for test +// This test will fail when this kernel is supported in framework +template +void DotKernel(const phi::Context& dev_ctx, + const phi::DenseTensor& x, + const phi::DenseTensor& y, + phi::DenseTensor* out) { + auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; + auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; + T* z = dev_ctx.template Alloc(out); + + // Loop over the total N elements of both operands while sum-reducing every + // B pairs along the way where B is the dimension of the least ordered axis + auto&& d = x.dims(); + auto const N = x.numel(); + auto const B = d[d.size() - 1]; + + for (int j = 0; j < N / B; j++) { + T ss = 0; + for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++); + z[j] = ss; + } +} + +} // namespace custom_kernel +} // namespace paddle + +PD_BUILD_PHI_KERNEL( + dot, CPU, ALL_LAYOUT, paddle::custom_kernel::DotKernel, int8_t) {} diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py new file mode 100644 index 0000000000000..a94307161d431 --- /dev/null +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from paddle.fluid import core +from distutils.sysconfig import get_python_lib +from distutils.core import setup, Extension +from setuptools.command.build_ext import build_ext + + +# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes
# Avoid a gcc warning below: +# cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid +# for C/ObjC but not for C++ +class BuildExt(build_ext): + + def build_extensions(self): + if '-Wstrict-prototypes' in self.compiler.compiler_so: + self.compiler.compiler_so.remove('-Wstrict-prototypes') + super(BuildExt, self).build_extensions() + + +# cc flags +paddle_extra_compile_args = [ + '-std=c++14', + '-shared', + '-fPIC', + '-Wno-parentheses', + '-DPADDLE_WITH_CUSTOM_KERNEL', +] +if core.is_compiled_with_npu(): + paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0'] + +# include path +site_packages_path = get_python_lib() +paddle_custom_kernel_include = [ + os.path.join(site_packages_path, 'paddle', 'include'), +] +# include path third_party +compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], + 'build/third_party') +paddle_custom_kernel_include += [ + os.path.join(compile_third_party_path, 'boost/src/extern_boost'), # boost + os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags + os.path.join(compile_third_party_path, 'install/glog/include'), # glog +] + +# libs path +paddle_custom_kernel_library_dir = [ + os.path.join(site_packages_path, 'paddle', 'fluid'), +] + +# libs +libs = [':core_avx.so'] +if not core.has_avx_core and core.has_noavx_core: + libs = [':core_noavx.so'] + +custom_kernel_dot_module = Extension( + 'custom_kernel_dot', + sources=['custom_kernel_dot_c.cc'], + include_dirs=paddle_custom_kernel_include, + library_dirs=paddle_custom_kernel_library_dir, + libraries=libs, + extra_compile_args=paddle_extra_compile_args) + +setup(name='custom_kernel_dot_c', + version='1.0', + description='custom kernel for compiling', + 
cmdclass={'build_ext': BuildExt}, + ext_modules=[custom_kernel_dot_module]) diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py index d1929fef5cc54..e28bfe00e7c4f 100644 --- a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py +++ b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py @@ -56,6 +56,42 @@ def tearDown(self): del os.environ['CUSTOM_DEVICE_ROOT'] +class TestCustomKernelDotC(unittest.TestCase): + + def setUp(self): + # compile so and set to current path + cur_dir = os.path.dirname(os.path.abspath(__file__)) + + # --inplace to place output so file to current dir + cmd = 'cd {} && {} custom_kernel_dot_c_setup.py build_ext --inplace'.format( + cur_dir, sys.executable) + os.system(cmd) + + # set environment for loading and registering compiled custom kernels + # only valid in current process + os.environ['CUSTOM_DEVICE_ROOT'] = cur_dir + + def test_custom_kernel_dot_run(self): + # test dot run + x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8) + y_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8) + result = np.sum(x_data * y_data, axis=1).reshape([2, 1]) + + import paddle + paddle.set_device('cpu') + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + out = paddle.dot(x, y) + + self.assertTrue( + np.array_equal(out.numpy(), result), + "custom kernel dot out: {},\n numpy dot out: {}".format( + out.numpy(), result)) + + def tearDown(self): + del os.environ['CUSTOM_DEVICE_ROOT'] + + if __name__ == '__main__': if os.name == 'nt' or sys.platform.startswith('darwin'): # only support Linux now diff --git a/python/setup.py.in b/python/setup.py.in index bb6416038f198..8b6a456865176 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -614,6 +614,8 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) + # phi core headers list(find_files('*.h', 
'@PADDLE_SOURCE_DIR@/paddle/phi/infermeta', recursive=True)) + # phi infermeta headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels', recursive=True)) + # phi kernels headers + # capi headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/capi', recursive=True)) + # phi capi headers # utils api headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True))) # paddle utils headers