In [1]:
import numpy as np
from ml_dtypes import bfloat16

from npu.build.appbuilder import AppBuilder
from npu.build.mtkernel import MTPassThrough
from npu.build.itkernel import ITWrite
from npu.build.kernel import Kernel
from npu.build.port import BufferPort, RTPPort

In [2]:
class SingleKernelApp(AppBuilder):
    def __init__(self, kernelfx):

        if isinstance(kernelfx, type):
            self.kernelfx = kernelfx()
        elif isinstance(kernelfx, Kernel) and callable(kernelfx):
            self.kernelfx = kernelfx
        else:
            raise Exception("kernelfx must be a Kernel or a Kernel instance")

        super().__init__()

    def callgraph(self, *args, **kwargs):
        return self.kernelfx(*args, **kwargs)

In [3]:
%%kernel

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <type_traits>

#include <aie_api/aie.hpp>

void leaky_relu(bfloat16 *in_buffer, bfloat16 *out_buffer, uint32_t nbytes)
{
    auto bfloat_zeroes = aie::zeros<bfloat16, 32>();
    auto bfloat_alpha = aie::broadcast<bfloat16, 32>(0.01);
    
    // divide nbytes by 2 because we're operating on 16-bit datatype
    for(int i=0; i<nbytes/2; i+=32) {
        
        ::aie::vector<bfloat16, 32> in1 = ::aie::load_v<32>(in_buffer);
        in_buffer += 32;

        ::aie::vector<bfloat16, 32> relu_out = aie::max(in1, bfloat_zeroes);
        auto leaky_out_accum = aie::mul(in1, bfloat_alpha);
        ::aie::vector<bfloat16, 32> leaky_out = leaky_out_accum.to_vector<bfloat16>();

        ::aie::mask<32> mask = aie::lt(in1, bfloat_zeroes);

        ::aie::vector<bfloat16, 32> out = aie::select(relu_out, leaky_out, mask);

        ::aie::store_v(out_buffer, out);
        out_buffer += 32;
    }
}

In [4]:
def behavioralfx(self):
    # self.out_buffer.array = np.maximum(self.in_buffer.array, 0)
    x = self.in_buffer.array
    self.out_buffer.array = np.where(x > 0, x, x * 0.01)

leaky_relu.behavioralfx = behavioralfx

In [5]:
a = np.arange(8, dtype=bfloat16)-4
b = np.zeros((8,), dtype=bfloat16)

In [6]:
app = SingleKernelApp(leaky_relu)
app(a, a.nbytes)

array([-0.04, -0.03, -0.02, -0.01,  0.  ,  1.  ,  2.  ,  3.  ],
      dtype=float32)

In [7]:
app.build(a, a.nbytes)

Using cached leaky_relu kernel object file...
Building the xclbin...
Successfully Building Application... SingleKernelApp.xclbin & SingleKernelApp.seq delivered


In [8]:
from npu.runtime import AppRunner

In [9]:
runner = AppRunner('SingleKernelApp.xclbin')

In [10]:
input_buffer = runner.allocate(shape=a.shape, dtype=bfloat16)
output_buffer = runner.allocate(shape=b.shape, dtype=bfloat16)

In [11]:
input_buffer[:] = np.arange(8, dtype=bfloat16)-4
input_buffer.sync_to_npu()

In [12]:
runner.call(input_buffer, output_buffer)
output_buffer.sync_from_npu()

In [13]:
input_buffer

PynqBuffer([-4, -3, -2, -1, 0, 1, 2, 3], dtype=bfloat16)

In [14]:
output_buffer

PynqBuffer([-0.0397949, -0.0299072, -0.0198975, -0.00994873, 0, 1, 2, 3],
           dtype=bfloat16)