In [1]:
from npu.build.appbuilder import AppBuilder
from npu.build.kernel import Kernel
from npu.runtime import AppRunner
import numpy as np

class SingleKernelApp(AppBuilder):
    """Build and run an application made of exactly one kernel.

    The kernel is traced at build time with the input arguments, and at run
    time every NumPy array among the arguments is mirrored into a buffer
    allocated through the ``AppRunner``.
    """

    def __init__(self, kernelfx, inputargs, outputargs):
        """Record the kernel and its arguments.

        Parameters
        ----------
        kernelfx : callable
            Kernel object invoked from ``callgraph``.
        inputargs : list
            Arguments forwarded to the kernel when the application is built.
        outputargs : list
            Extra arguments that only take part at run time (output arrays).
        """
        self.kernelfx = kernelfx
        self.inputargs = inputargs
        self.args = inputargs + outputargs
        # Only arrays get a run-time buffer; scalars (e.g. a byte count) are
        # filtered out here and are supplied to the kernel at build time only.
        self.bufs = [b for b in self.args if isinstance(b, np.ndarray)]
        self.app = None
        super().__init__()

    def callgraph(self, *args, **kwargs):
        """Call the wrapped kernel with the given arguments and return its result."""
        return self.kernelfx(*args, **kwargs)

    def build(self):
        """Build the application, passing the stored input arguments to the base builder."""
        super().build(*self.inputargs)

    def run(self):
        """Run the built application once and print every buffer afterwards.

        The host arrays are copied into freshly allocated buffers, the buffers
        are synced to the NPU, the application is called, and the buffers are
        synced back and printed in argument order. The host arrays themselves
        are not written to.
        """
        # getattr rather than self.app: the notebook releases the runner with
        # `del vaddapp.app`, which removes the attribute entirely. A later
        # run() must then open a new runner instead of raising AttributeError.
        if getattr(self, 'app', None) is None:
            self.app = AppRunner('SingleKernelApp.xclbin')

        appbufs = [self.app.allocate(shape=b.shape, dtype=b.dtype) for b in self.bufs]
        for appbuf, hostbuf in zip(appbufs, self.bufs):
            np.copyto(appbuf, hostbuf)
        for appbuf in appbufs:
            appbuf.sync_to_npu()

        self.app.call(*appbufs)

        for appbuf in appbufs:
            appbuf.sync_from_npu()
        for appbuf in appbufs:
            print(appbuf)

In [2]:
%%kernel

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <aie_api/aie.hpp>

// Element-wise addition of two uint8 buffers, 32 bytes per iteration:
//   c[i] = a[i] + b[i]  for the first (nbytes / 32) * 32 bytes.
// nbytes is expected to be a multiple of 32; any trailing partial vector is
// left untouched so the kernel never reads or writes past the buffers.
extern "C" {
  void vectoradd(uint8_t* a, uint8_t* b, uint8_t* c, const uint32_t nbytes) {

    ::aie::vector<uint8_t, 32> ai, bi, ci;

    // Number of whole 32-byte vectors. An unsigned counter avoids the
    // signed/unsigned comparison against nbytes.
    const uint32_t nvectors = nbytes >> 5;

    for(uint32_t j=0; j<nvectors; j++) {
        ai = ::aie::load_v<32>(a);
        a += 32;
        bi = ::aie::load_v<32>(b);
        b += 32;
        ci = ::aie::add(ai, bi);
        ::aie::store_v(c, ci);
        c += 32;
    }
 }
}


In [3]:
# Seeded generator so a fresh kernel run regenerates the same operands.
SEED = 0
VECTOR_LEN = 4096  # number of uint8 elements per operand
rng = np.random.default_rng(SEED)

# Two random operands covering the full uint8 range, plus a zeroed result array.
a = rng.integers(0, 256, size=VECTOR_LEN, dtype=np.uint8)
b = rng.integers(0, 256, size=VECTOR_LEN, dtype=np.uint8)
c = np.zeros(VECTOR_LEN, dtype=np.uint8)

In [4]:
# Attach the result array to the kernel's `c` port so its dimensions are
# known without having to write a behavioral model for the kernel.
vectoradd.c.array = c

# a, b and the byte count are the build-time inputs; c is the output array.
vaddapp = SingleKernelApp(
    vectoradd,
    inputargs=[a, b, a.nbytes],
    outputargs=[c],
)

In [5]:
# Build the vectoradd application from the stored input arguments; the log
# below reports the generated SingleKernelApp.xclbin / .seq files.
vaddapp.build()

Using cached vectoradd kernel object file...
Building the xclbin...
Successfully Building Application... SingleKernelApp.xclbin & SingleKernelApp.seq delivered


In [6]:
# Run the application once; the buffers are printed in argument order:
# a, b, then the result c.
vaddapp.run()

[ 77  36 249 ... 144  14  48]
[207  22  78 ...  41  92  74]
[ 28  58  71 ... 185 106 122]


In [7]:
# Drop the AppRunner held by vaddapp before the next application is built.
# NOTE(review): presumably this releases the NPU for the next app; confirm.
del vaddapp.app

In [12]:
# Instantiate the library-provided Inverse kernel and print its C++ source.
from npu.lib.kernels import Inverse
inv = Inverse()
print(inv.srccode)

// Copyright 2023 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: MIT

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <aie_api/aie.hpp>

extern "C" {

void inverse(uint8_t *in_buffer, uint8_t* out_buffer, uint32_t nbytes) {
    ::aie::vector<uint8_t, 32> buffer;
    ::aie::vector<uint8_t, 32> inverted_buffer;
    uint16_t loop_count = (nbytes) >> 5;
    for(int j=0; j<loop_count; j++) {
        buffer = ::aie::load_v<32>(in_buffer);
        inverted_buffer = ::aie::sub((uint8_t)255, buffer);
        in_buffer += 32;
        ::aie::store_v((uint8_t*)out_buffer, inverted_buffer);
        out_buffer += 32;
    }
} 

}


In [13]:
# Reuse `a` as the input and `b` as the output array. run() copies b's
# current contents into the device buffer before the call and prints the
# device buffers afterwards; the host array b itself is not written to.
invapp = SingleKernelApp(
    inv,
    inputargs=[a, a.nbytes],
    outputargs=[b],
)

In [14]:
# Build the inverse application; per the log below, the output files use
# the same SingleKernelApp.xclbin / .seq names as the previous build.
invapp.build()

Using cached inverse kernel object file...
Building the xclbin...
Successfully Building Application... SingleKernelApp.xclbin & SingleKernelApp.seq delivered


In [15]:
# Run the inverse application; prints the input buffer followed by the
# output buffer (each output byte is 255 minus the input byte).
invapp.run()

[ 77  36 249 ... 144  14  48]
[178 219   6 ... 111 241 207]


In [16]:
# Remove the notebook's reference to the whole application object.
# NOTE(review): presumably this also releases its AppRunner; confirm.
del invapp