In [1]:
import tvm
from tvm import te
import numpy as np

In [2]:
# TVM setup: compile through LLVM (for both target and host) and take
# device 0 of the matching device kind.
target = tvm.target.Target("llvm", host="llvm")
device = tvm.device(target.kind.name, dev_id=0)

In [3]:
# Element-wise multiply-accumulate kernel: ADD = A*B + C
#
# A C programmer has (at least) two ways to spell this out.
#
# Two separate loop nests with an intermediate MUL buffer:
#
#   for (int i = 0; i < 2; ++i)
#     for (int j = 0; j < 2; ++j)
#        MUL[i][j] = A[i][j] * B[i][j];
#
#   for (int i = 0; i < 2; ++i)
#     for (int j = 0; j < 2; ++j)
#        ADD[i][j] = MUL[i][j] + C[i][j];
#
# Or a single, fused loop nest:
#
#   for (int i = 0; i < 2; ++i)
#     for (int j = 0; j < 2; ++j)
#        MAC[i][j] = A[i][j] * B[i][j] + C[i][j];
#
# The tensor-expression description below is written in the two-stage
# form: MUL is defined first and ADD is defined in terms of it.

m, n = 2, 2

# Symbolic (m, n) inputs of the kernel.
A, B, C = (te.placeholder((m, n), name=label) for label in ("A", "B", "C"))

# Stage 1: element-wise product of A and B.
MUL = te.compute((m, n), lambda i, j: A[i, j] * B[i, j], name="MUL")

# Stage 2: add C to the product.
ADD = te.compute((m, n), lambda i, j: MUL[i, j] + C[i, j], name="ADD")

# Default schedule rooted at the final stage; the built function takes the
# three inputs followed by the output buffer.
schedule = te.create_schedule(ADD.op)
mac = tvm.build(schedule, args=[A, B, C, ADD], target=target, name="mac")

In [4]:
# Simple test for mac kernel: run it on constant inputs and check the output
# against a NumPy reference instead of relying on eyeballing the printout.
#
# NOTE: this cell rebinds A, B and C from the TE placeholders created in the
# kernel cell to device arrays. Re-run the kernel cell before building
# anything else from the placeholders.

# Host-side inputs, sized from m and n so they always match the kernel shape.
a_host = np.full((m, n), 1, dtype=A.dtype)
b_host = np.full((m, n), 2, dtype=B.dtype)
c_host = np.full((m, n), 3, dtype=C.dtype)

# Reference result computed on the host.
expected = a_host * b_host + c_host

A = tvm.nd.array(a_host, device)
B = tvm.nd.array(b_host, device)
C = tvm.nd.array(c_host, device)

# The output buffer takes its dtype from the output tensor ADD, not from an input.
result = tvm.nd.array(np.zeros((m, n), dtype=ADD.dtype), device)

mac(A, B, C, result)

# Fail loudly if the kernel output deviates from the reference.
np.testing.assert_allclose(result.numpy(), expected)

A, B, C, result

(<tvm.nd.NDArray shape=(2, 2), cpu(0)>
 array([[1., 1.],
        [1., 1.]], dtype=float32),
 <tvm.nd.NDArray shape=(2, 2), cpu(0)>
 array([[2., 2.],
        [2., 2.]], dtype=float32),
 <tvm.nd.NDArray shape=(2, 2), cpu(0)>
 array([[3., 3.],
        [3., 3.]], dtype=float32),
 <tvm.nd.NDArray shape=(2, 2), cpu(0)>
 array([[5., 5.],
        [5., 5.]], dtype=float32))