In [1]:
import numpy as np
from ml_dtypes import bfloat16

import npu
from npu.build.appbuilder import AppBuilder
from npu.build.mtkernel import MTPassThrough
from npu.build.itkernel import ITWrite
from npu.build.kernel import Kernel
from npu.build.port import BufferPort, RTPPort

In [2]:
class SingleKernelApp(AppBuilder):
    """Application whose callgraph consists of a single kernel call.

    Parameters
    ----------
    kernelfx : type or Kernel
        Either a class, which is instantiated with no arguments, or an
        already constructed, callable ``Kernel`` instance, which is used as is.

    Raises
    ------
    TypeError
        If ``kernelfx`` is neither a class nor a callable ``Kernel`` instance.
    """

    def __init__(self, kernelfx):
        if isinstance(kernelfx, type):
            # A class was supplied: construct the kernel with no arguments.
            self.kernelfx = kernelfx()
        elif isinstance(kernelfx, Kernel) and callable(kernelfx):
            self.kernelfx = kernelfx
        else:
            # TypeError is a subclass of Exception, so callers that catch
            # Exception keep working while the failure is reported precisely.
            raise TypeError(
                "kernelfx must be a Kernel class or a callable Kernel instance, "
                f"got {type(kernelfx).__name__}"
            )

        # self.kernelfx is assigned before the base-class initialiser runs.
        super().__init__()

    def callgraph(self, *args, **kwargs):
        """Forward every argument to the wrapped kernel and return its result."""
        return self.kernelfx(*args, **kwargs)

In [3]:
%%kernel

void passthrough(bfloat16 *in_buffer, bfloat16 *out_buffer, uint32_t nbytes)
{
    // Element-wise copy of in_buffer into out_buffer.
    // nbytes is a byte count; nbytes / 2 elements are copied, which is
    // consistent with 2-byte bfloat16 elements.
    const uint32_t num_elements = nbytes / 2;

    for (int idx = 0; idx < num_elements; ++idx) {
        out_buffer[idx] = in_buffer[idx];
    }
}

In [4]:
def behavioralfx(self):
    """Python model of the passthrough: the output takes the input's array.

    ``self.out_buffer.array`` is bound to the very same object held by
    ``self.in_buffer.array``; no copy is made. Returns ``None``.
    """
    source_array = self.in_buffer.array
    self.out_buffer.array = source_array

In [5]:
# Attach the Python function defined above as the `behavioralfx` attribute of `passthrough`.
# NOTE(review): `passthrough` is presumably the Python object created by the %%kernel cell — confirm.
passthrough.behavioralfx = behavioralfx

In [6]:
# Eight-element bfloat16 vectors: `a` is all ones (the input data),
# `b` is all zeros with the same shape and dtype as `a`.
a = np.ones(8, dtype=bfloat16)
b = np.zeros_like(a)

In [7]:
# Wrap the passthrough kernel in a SingleKernelApp.
app = SingleKernelApp(passthrough)

In [8]:
# Call the app with the input array and its size in bytes; the recorded
# output is an all-ones bfloat16 array of length 8.
# NOTE(review): presumably this evaluates the Python behavioral model rather than hardware — confirm.
app(a, a.nbytes)

array([1, 1, 1, 1, 1, 1, 1, 1], dtype=bfloat16)

In [9]:
# Build the application with the same arguments as the call above; the
# recorded log reports SingleKernelApp.xclbin and SingleKernelApp.seq as delivered.
app.build(a, a.nbytes)

Building the passthrough kernel...
Building the xclbin...
Successfully Building Application... SingleKernelApp.xclbin & SingleKernelApp.seq delivered


In [11]:
from npu.runtime import AppRunner

In [12]:
# Create a runner for the xclbin file named in the build log.
runner = AppRunner('SingleKernelApp.xclbin')

In [14]:
# Allocate runner-managed buffers matching the shapes of `a` and `b`, both bfloat16.
input_buffer = runner.allocate(shape=a.shape, dtype=bfloat16)
output_buffer = runner.allocate(shape=b.shape, dtype=bfloat16)

# Fill the input buffer with the contents of `a`, then sync it to the NPU.
input_buffer[:] = a
input_buffer.sync_to_npu()

In [15]:
# Invoke the application on the two buffers.
runner.call(input_buffer, output_buffer)

# Sync the output buffer back from the NPU before reading it.
output_buffer.sync_from_npu()

In [16]:
# Display the result; the recorded output is all ones, matching the input `a`.
output_buffer

PynqBuffer([1, 1, 1, 1, 1, 1, 1, 1], dtype=bfloat16)