<a href="https://colab.research.google.com/github/rajshekharM/test/blob/master/TVM_VTA_Matrix_Multiply.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB:
    ! gsutil cp "gs://tvm-fcrc-binaries-7f775516ff9dfab922c304049f294cec/tvm.tar.gz" /tmp/tvm.tar.gz
    ! mkdir -p /tvm
    ! tar -xf /tmp/tvm.tar.gz --strip-components=4 --directory /tvm
    ! ls -la /tvm
    ! bash /tvm/package.sh
    # Add TVM to the Python path.
    import sys
    sys.path.append('/tvm/python')
    sys.path.append('/tvm/topi/python')
    sys.path.append('/tvm/nnvm/python')
    sys.path.append('/tvm/vta/python')
else:
    print("Notebook executing locally, skipping Colab setup ...")

Copying gs://tvm-fcrc-binaries-7f775516ff9dfab922c304049f294cec/tvm.tar.gz...
\ [1 files][114.5 MiB/114.5 MiB]                                                
Operation completed over 1 objects/114.5 MiB.                                    
total 164
drwxr-xr-x 21 root root  4096 Nov  8 20:03 .
drwxr-xr-x  1 root root  4096 Nov  8 20:03 ..
drwx------  8 root root  4096 Jun 11 23:37 3rdparty
drwx------ 12 root root  4096 Jun 11 23:37 apps
drwx------  3 root root  4096 Jun 12 00:02 build
drwx------  4 root root  4096 Jun 11 23:37 cmake
-rw-------  1 root root 10406 Jun 11 23:37 CMakeLists.txt
drwx------  6 root root  4096 Jun 11 23:37 conda
-rw-------  1 root root  5673 Jun 11 23:37 CONTRIBUTORS.md
drwx------  3 root root  4096 Jun 11 23:37 docker
drwx------ 11 root root  4096 Jun 11 23:37 docs
drwx------  4 root root  4096 Jun 11 23:37 golang
drwx------  3 root root  4096 Jun 11 23:37 include
-rw-------  1 root root 10542 Jun 11 23:37 Jenkinsfile
drwx------  6 root root  4096 Jun 11 23:

In [2]:
from __future__ import absolute_import, print_function

import os
import tvm
import vta
import numpy as np
from tvm import rpc
from tvm.contrib import util
from vta.testing import simulator

# Load VTA parameters from the vta/config/vta_config.json file.
# `env` is used by the following cells for the GEMM tile sizes, data types,
# buffer scopes and the target name.
env = vta.get_env()

# Print the target specified in the vta_config.json
# (the saved output of this run shows "sim").
print(env.TARGET)

sim


In [0]:
# We read the Pynq RPC host IP address and port number from the OS environment
# (falling back to the defaults below when the variables are not set).
host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))

# We configure both the bitstream and the runtime system on the Pynq
# to match the VTA configuration specified by the vta_config.json file.
if env.TARGET == "pynq":

    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc"), \
        "TVM was built without RPC support; rebuild with RPC enabled"
    remote = rpc.connect(host, port)

    # Reconfigure the JIT runtime
    vta.reconfig_runtime(remote)

    # Program the FPGA with a pre-compiled VTA bitstream.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    vta.program_fpga(remote, bitstream=None)

# In simulation mode, host the RPC server locally.
elif env.TARGET == "sim":
    remote = rpc.LocalSession()

# Any other target would leave `remote` undefined and only fail several
# cells later with a NameError, so fail here with an explicit message.
else:
    raise RuntimeError(
        "Unsupported VTA target {!r}: this notebook handles only "
        "'pynq' and 'sim'".format(env.TARGET))

In [5]:
# Report the tile shape and element type of each GEMM core operand.
# Each entry pairs a message template with the (rows, cols, type) to fill in.
gemm_operands = [
    ("Weights are {} by {}, of type {}",
     env.BLOCK_OUT, env.BLOCK_IN, env.wgt_dtype),
    ("Input activations are {} by {}, of type {}",
     env.BATCH, env.BLOCK_IN, env.inp_dtype),
    ("Output activations are {} by {}, of type {}",
     env.BATCH, env.BLOCK_OUT, env.acc_dtype),
]
for template, tile_rows, tile_cols, elem_type in gemm_operands:
    print(template.format(tile_rows, tile_cols, elem_type))

Weights are 16 by 16, of type int8
Input activations are 1 by 16, of type int8
Output activations are 1 by 16, of type int32


In [6]:

# Derive the per-cycle bandwidth each on-chip memory must sustain:
# (number of elements in one tile) * (bit width of one element).
wgt_bits_per_cycle = env.BLOCK_OUT * env.BLOCK_IN * env.WGT_WIDTH
inp_bits_per_cycle = env.BATCH * env.BLOCK_IN * env.INP_WIDTH
acc_bits_per_cycle = env.BATCH * env.BLOCK_OUT * env.ACC_WIDTH

print("Weight buffer requires {} bits per cycle of read xput".format(
    wgt_bits_per_cycle))
print("Input buffer requires {} bits per cycle of read xput".format(
    inp_bits_per_cycle))
print("Register file requires {} bits per cycle of read & write xput".format(
    acc_bits_per_cycle))

Weight buffer requires 2048 bits per cycle of read xput
Input buffer requires 128 bits per cycle of read xput
Register file requires 512 bits per cycle of read & write xput


In [0]:
# Output channel factor m: the result has m * env.BLOCK_OUT output channels
m = 2
# Input channel factor n: the operands have n * env.BLOCK_IN input channels
n = 2
# Batch factor o (we use single batch inference)
o = 1

# Tiled 4-D shapes, shared by each placeholder and its on-chip copy stage
inp_tiled_shape = (o, n, env.BATCH, env.BLOCK_IN)
wgt_tiled_shape = (m, n, env.BLOCK_OUT, env.BLOCK_IN)

# Input (A) and weight (B) placeholder tensors in tiled data format
A = tvm.placeholder(inp_tiled_shape, name="A", dtype=env.inp_dtype)
B = tvm.placeholder(wgt_tiled_shape, name="B", dtype=env.wgt_dtype)

# Element-wise copies of A and B; these stages become the buffer copies
A_buf = tvm.compute(inp_tiled_shape, lambda *idx: A(*idx), name="A_buf")
B_buf = tvm.compute(wgt_tiled_shape, lambda *idx: B(*idx), name="B_buf")

In [0]:
# Reduction axes: ko walks the n input-channel tiles, ki walks the
# env.BLOCK_IN elements inside one tile.
ko = tvm.reduce_axis((0, n), name="ko")
ki = tvm.reduce_axis((0, env.BLOCK_IN), name="ki")


def _gemm_element(bo, co, bi, ci):
    """Return the sum-of-products expression for one output element.

    Both operands are widened to env.acc_dtype before the multiply, and the
    products are summed over the ko and ki reduction axes.
    """
    lhs = A_buf[bo, ko, bi, ki].astype(env.acc_dtype)
    rhs = B_buf[co, ko, ci, ki].astype(env.acc_dtype)
    return tvm.sum(lhs * rhs, axis=[ko, ki])


# Describe the in-VTA matrix multiplication
C_buf = tvm.compute(
    (o, m, env.BATCH, env.BLOCK_OUT), _gemm_element, name="C_buf")

In [0]:
# Final stage: narrow every accumulator value from C_buf to env.inp_dtype.
# C is the tensor handed back to the caller (sent to main memory).
out_tiled_shape = (o, m, env.BATCH, env.BLOCK_OUT)
C = tvm.compute(
    out_tiled_shape,
    lambda *idx: C_buf(*idx).astype(env.inp_dtype),
    name="C")

In [10]:
# Create the default (untransformed) schedule for C and show the loop nest
# that TVM lowers it to.
s = tvm.create_schedule(C.op)
default_ir = tvm.lower(s, [A, B, C], simple_mode=True)
print(default_ir)

// attr [A_buf] storage_scope = "global"
allocate A_buf[int8 * 32]
// attr [B_buf] storage_scope = "global"
allocate B_buf[int8 * 1024]
// attr [C_buf] storage_scope = "global"
allocate C_buf[int32 * 32]
produce A_buf {
  for (i1, 0, 2) {
    for (i3, 0, 16) {
      A_buf[((i1*16) + i3)] = A[((i1*16) + i3)]
    }
  }
}
produce B_buf {
  for (i0, 0, 2) {
    for (i1, 0, 2) {
      for (i2, 0, 16) {
        for (i3, 0, 16) {
          B_buf[((((((i0*2) + i1)*16) + i2)*16) + i3)] = B[((((((i0*2) + i1)*16) + i2)*16) + i3)]
        }
      }
    }
  }
}
produce C_buf {
  for (co, 0, 2) {
    for (ci, 0, 16) {
      C_buf[((co*16) + ci)] = 0
      for (ko, 0, 2) {
        for (ki, 0, 16) {
          C_buf[((co*16) + ci)] = (C_buf[((co*16) + ci)] + (int32(A_buf[((ko*16) + ki)])*int32(B_buf[((((((co*2) + ko)*16) + ci)*16) + ki)])))
        }
      }
    }
  }
}
produce C {
  for (i1, 0, 2) {
    for (i3, 0, 16) {
      C[((i1*16) + i3)] = int8(C_buf[((i1*16) + i3)])
    }
  }
}



In [11]:
# Place each intermediate tensor in the matching VTA on-chip buffer scope:
# inputs -> env.inp_scope, weights -> env.wgt_scope, accumulator -> env.acc_scope.
inp_stage, wgt_stage, acc_stage = s[A_buf], s[B_buf], s[C_buf]
inp_stage.set_scope(env.inp_scope)
wgt_stage.set_scope(env.wgt_scope)
acc_stage.set_scope(env.acc_scope)

stage(C_buf, 0x3078470)

In [12]:
# Move both buffer copies into the matrix multiply loop, at the ko axis
for copy_tensor in (A_buf, B_buf):
    s[copy_tensor].compute_at(s[C_buf], ko)

# Tag the buffer copies (and the output stage C) with the DMA pragma on
# their outermost axis to insert a DMA transfer
for dma_tensor in (A_buf, B_buf, C):
    dma_stage = s[dma_tensor]
    dma_stage.pragma(dma_stage.op.axis[0], env.dma_copy)

# Let's take a look at the transformed schedule
transformed_ir = tvm.lower(s, [A, B, C], simple_mode=True)
print(transformed_ir)

// attr [C_buf] storage_scope = "local.acc_buffer"
// attr [A_buf] storage_scope = "local.inp_buffer"
// attr [B_buf] storage_scope = "local.wgt_buffer"
produce C_buf {
  for (co, 0, 2) {
    for (ci, 0, 16) {
      C_buf[((co*16) + ci)] = 0
      for (ko, 0, 2) {
        produce A_buf {
          // attr [iter_var(i0, )] pragma_dma_copy = 1
          for (i3, 0, 16) {
            A_buf[i3] = A[((ko*16) + i3)]
          }
        }
        produce B_buf {
          // attr [iter_var(i0, )] pragma_dma_copy = 1
          for (i3, 0, 16) {
            B_buf[i3] = B[((((((co*2) + ko)*16) + ci)*16) + i3)]
          }
        }
        for (ki, 0, 16) {
          C_buf[((co*16) + ci)] = (C_buf[((co*16) + ci)] + (int32(A_buf[ki])*int32(B_buf[ki])))
        }
      }
    }
  }
}
produce C {
  // attr [iter_var(i0, )] pragma_dma_copy = 1
  for (i1, 0, 2) {
    for (i3, 0, 16) {
      C[((i1*16) + i3)] = int8(C_buf[((i1*16) + i3)])
    }
  }
}



In [13]:
# Hoist the outer reduction axis ko to the outermost position, keep the four
# spatial axes in their original order, and leave ki innermost.
gemm_stage = s[C_buf]
axis_bo, axis_co, axis_bi, axis_ci = gemm_stage.op.axis
gemm_stage.reorder(ko, axis_bo, axis_co, axis_bi, axis_ci, ki)

# Replace the loops from the third spatial axis inwards with the VTA GEMM
# intrinsic.
gemm_stage.tensorize(axis_bi, env.gemm)

# Let's take a look at the finalized schedule
finalized_ir = vta.lower(s, [A, B, C], simple_mode=True)
print(finalized_ir)

// attr [C_buf] storage_scope = "local.acc_buffer"
// attr [A_buf] storage_scope = "local.inp_buffer"
// attr [B_buf] storage_scope = "local.wgt_buffer"
produce C_buf {
  // attr [iter_var(vta, , vta)] coproc_scope = 2
  // attr [iter_var(vta, , vta)] coproc_uop_scope = "VTAPushGEMMOp"
  VTAUopLoopBegin(2, 1, 0, 0)
  VTAUopPush(0, 1, 0, 0, 0, 0, 0, 0)
  VTAUopLoopEnd()
  vta.coproc_dep_push(2, 1)
  for (ko, 0, 2) {
    // attr [iter_var(vta, , vta)] coproc_scope = 1
    vta.coproc_dep_pop(2, 1)
    produce A_buf {
      VTALoadBuffer2D(tvm_thread_context(VTATLSCommandHandle()), A, ko, 1, 1, 1, 0, 0, 0, 0, 0, 2)
    }
    produce B_buf {
      VTALoadBuffer2D(tvm_thread_context(VTATLSCommandHandle()), B, ko, 1, 2, 2, 0, 0, 0, 0, 0, 1)
    }
    vta.coproc_dep_push(1, 2)
    // attr [iter_var(vta, , vta)] coproc_scope = 2
    vta.coproc_dep_pop(1, 2)
    // attr [iter_var(vta, , vta)] coproc_uop_scope = "VTAPushGEMMOp"
    VTAUopLoopBegin(2, 1, 0, 1)
    VTAUopPush(0, 0, 0, 0, 0, 0, 0, 0

In [0]:
# Build GEMM VTA kernel (set debug flags)
with vta.build_config(debug_flag=0x6):
    my_gemm = tvm.build(
        s, [A, B, C], "ext_dev", env.target_host, name="my_gemm")

# Write the compiled module into an object file inside a temporary directory.
temp = util.tempdir()
gemm_obj_path = temp.relpath("gemm.o")
my_gemm.save(gemm_obj_path)

# Send the object file over RPC, then load it on the remote side by name.
remote.upload(gemm_obj_path)
f = remote.load_module("gemm.o")

In [0]:
# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the A and B arrays randomly with integers in [-128, 128)
# (np.random.randint excludes the upper bound).
a_shape_2d = (o * env.BATCH, n * env.BLOCK_IN)
b_shape_2d = (m * env.BLOCK_OUT, n * env.BLOCK_IN)
A_orig = np.random.randint(-128, 128, size=a_shape_2d).astype(A.dtype)
B_orig = np.random.randint(-128, 128, size=b_shape_2d).astype(B.dtype)

# Apply packing to the A and B arrays from a 2D to a 4D packed layout:
# split each dimension into (tile index, index inside tile), then bring the
# two tile indices to the front.
tile_major_order = (0, 2, 1, 3)
A_packed = A_orig.reshape(
    o, env.BATCH, n, env.BLOCK_IN).transpose(tile_major_order)
B_packed = B_orig.reshape(
    m, env.BLOCK_OUT, n, env.BLOCK_IN).transpose(tile_major_order)

# Format the input/output arrays with tvm.nd.array to the DLPack standard;
# the output starts out zero-filled.
A_nd = tvm.nd.array(A_packed, ctx)
B_nd = tvm.nd.array(B_packed, ctx)
C_zeros = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(C.dtype)
C_nd = tvm.nd.array(C_zeros, ctx)

# Invoke the module to perform the computation
f(A_nd, B_nd, C_nd)

In [16]:
# Compute the reference result with numpy: widen both operands to the
# accumulator type, multiply A by B transposed, then cast to C's dtype.
A_wide = A_orig.astype(env.acc_dtype)
B_wide_T = B_orig.T.astype(env.acc_dtype)
C_ref = np.dot(A_wide, B_wide_T).astype(C.dtype)

# Pack the 2D reference into the same 4D tiled layout as the kernel output.
C_ref = C_ref.reshape(
    o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))

# Element-wise comparison; raises AssertionError on any mismatch.
np.testing.assert_equal(C_ref, C_nd.asnumpy())
print("Successful matrix multiply test!")

Successful matrix multiply test!


**Summary**

This tutorial showcases the TVM workflow to implement a simple matrix multiplication example on VTA. The general workflow includes:

1. Programming the FPGA with the VTA bitstream over RPC.
2. Describing matrix multiplication via a series of computations.
3. Describing how we want to perform the computation using schedule primitives.
4. Compiling the function to the VTA target.
5. Running the compiled module and verifying it against a numpy implementation.