# Padding

In [1]:
import d2ltvm
import tvm
from tvm import te
import numpy as np

successd...



In [2]:
def padding(X, ph, pw, val=0):
    """ Surround the last two dimensions of X with a constant border
        X : tensor with at least two dimensions
        ph, pw : number of rows / columns added on each side
        val : value written into the border, default 0
        Return a compute op named 'PaddedX' whose last two extents are
        grown by 2 * ph and 2 * pw; leading dimensions are unchanged
    """
    assert len(X.shape) >= 2
    lead_dims = X.shape[0:-2]
    rows = X.shape[-2]
    cols = X.shape[-1]
    padded_shape = (*lead_dims, rows + 2 * ph, cols + 2 * pw)

    def _padded_value(*idx):
        # The trailing two indices address the padded plane; anything in
        # front of them is forwarded to X untouched.
        *lead, r, c = idx
        in_border = te.any(r < ph, r >= ph + rows, c < pw, c >= pw + cols)
        # Shift back by the padding to find the matching element of X.
        source = X[tuple(lead) + (r - ph, c - pw)]
        return te.if_then_else(in_border, val, source)

    return te.compute(padded_shape, _padded_value, name='PaddedX')

In [3]:
# Symbolic 3-D input; padding() only touches its last two dimensions.
A = te.placeholder((2,3,4), name='A')
# The padding sizes are left as symbolic variables so they can be supplied
# when the compiled module is called.
ph = te.var(name='ph')
pw = te.var(name='pw')
B = padding(A, ph, pw)
# Default schedule (no transformations); show the lowered IR for inspection.
s = te.create_schedule(B.op)
tvm.lower(s, [A, B]).show()

In [4]:
# ph and pw are listed as arguments because they are symbolic in the schedule.
mod = tvm.build(s, [A, B, ph, pw])
# Concrete padding sizes passed at call time.
ph_1 = 1
pw_1 = 1
a = tvm.nd.array(np.ones((2,3,4)).astype('float32'))
# Output buffer: (2, 3 + 2 * ph_1, 4 + 2 * pw_1) = (2, 5, 6).
b = tvm.nd.array(np.empty((2,5,6)).astype('float32'))
mod(a, b, ph_1, pw_1)
print(b)

[[[0. 0. 0. 0. 0. 0.]
  [0. 1. 1. 1. 1. 0.]
  [0. 1. 1. 1. 1. 0.]
  [0. 1. 1. 1. 1. 0.]
  [0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0.]
  [0. 1. 1. 1. 1. 0.]
  [0. 1. 1. 1. 1. 0.]
  [0. 1. 1. 1. 1. 0.]
  [0. 0. 0. 0. 0. 0.]]]


# Convolution

In [5]:
def conv_out_size(n, k, p, s):
    """ Output extent of a convolution along one axis (width or height)
        n : input size along that axis
        k : kernel size
        p : padding applied on each side
        s : stride
        Return the number of output positions along that axis
    """
    padded = n + 2 * p
    # Number of stride-sized steps the kernel can take after its first position.
    steps = (padded - k) // s
    return steps + 1

In [27]:
def conv(oc, ic, nh, nw, kh, kw, ph=0, pw=0, sh=1, sw=1):
    """ Convolution
        oc, ic : output and input channels
        nh, nw : input height and width
        kh, kw : kernel height and width
        ph, pw : height and width padding sizes, default 0
        sh, sw : height and width strides, default 1
        Return (X, K, Y, PaddedX): the input and kernel placeholders, the
        output compute op, and the padded input (X itself when both
        padding sizes are 0)
    """
    # Reduction axes: input channels and the kernel window.
    ric = te.reduce_axis((0, ic), name='ric')
    rkh = te.reduce_axis((0, kh), name='rkh')
    rkw = te.reduce_axis((0, kw), name='rkw')

    # Output extents already account for the padding on both sides.
    oh = conv_out_size(nh, kh, ph, sh)
    ow = conv_out_size(nw, kw, pw, sw)

    # pad X and then compute Y
    X = te.placeholder((ic, nh, nw), name='X')
    K = te.placeholder((oc, ic, kh, kw), name='K')

    # Pad whenever EITHER dimension needs it. Testing `ph * pw != 0` would
    # skip padding when only one of them is non-zero, while oh/ow above still
    # assume the padded extent, making Y index past the end of X.
    PaddedX = padding(X, ph, pw) if ph != 0 or pw != 0 else X
    Y = te.compute(
        (oc, oh, ow),
        lambda c, i, j: te.sum(
            PaddedX[ric, i * sh + rkh, j * sw + rkw] * K[c, ric, rkh, rkw],
            axis=[ric, rkh, rkw]),
        name='Y')
    return X, K, Y, PaddedX

In [28]:
def get_conv_data(oc, ic, n, k, p, s=1, constructor=None):
    """ Build test tensors for a square convolution: a random 3-D data
        tensor, a random 4-D kernel tensor and an uninitialized 3-D output
        tensor, all float32. NumPy's global random seed is reset to 0 first.
        oc, ic : output and input channels
        n : input width and height
        k : kernel width and height
        p : padding size
        s : stride, default 1
        constructor : optional callable applied to each array before returning
        Return (data, weight, out)
    """
    np.random.seed(0)
    input_shape = (ic, n, n)
    kernel_shape = (oc, ic, k, k)
    data = np.random.normal(size=input_shape).astype('float32')
    weight = np.random.normal(size=kernel_shape).astype('float32')
    side = conv_out_size(n, k, p, s)
    out = np.empty(shape=(oc, side, side)).astype('float32')
    if constructor is None:
        return data, weight, out
    # Convert in the order data, weight, out.
    return constructor(data), constructor(weight), constructor(out)

In [29]:
# Problem size: 4 output channels, 6 input channels, 12x12 input,
# 3x3 kernel, padding 1, stride 1.
oc, ic, n, k, p, s = 4, 6, 12, 3, 1, 1
# Square input and kernel, same padding/stride in both dimensions.
X, K, Y, _ = conv(oc, ic, n, n, k, k, p, p, s, s)
sch = te.create_schedule(Y.op)
mod = tvm.build(sch, [X, K, Y])
# Print the lowered IR of the default (unoptimized) schedule.
print(tvm.lower(sch, [X, K, Y], simple_mode=True))
# Wrap the NumPy arrays as TVM NDArrays and run; the result is written to `out`.
data, weight, out = get_conv_data(oc, ic, n, k, p, s, constructor=tvm.nd.array)
mod(data, weight, out)

@main = primfn(X_1: handle, K_1: handle, Y_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {X: Buffer(X_2: Pointer(float32), float32, [864], []),
             K: Buffer(K_2: Pointer(float32), float32, [216], []),
             Y: Buffer(Y_2: Pointer(float32), float32, [576], [])}
  buffer_map = {X_1: X, K_1: K, Y_1: Y}
  preflattened_buffer_map = {X_1: X_3: Buffer(X_2, float32, [6, 12, 12], []), K_1: K_3: Buffer(K_2, float32, [4, 6, 3, 3], []), Y_1: Y_3: Buffer(Y_2, float32, [4, 12, 12], [])} {
  allocate(PaddedX: Pointer(global float32), float32, [1176]), storage_scope = global {
    for (i0: int32, 0, 6) {
      for (i1: int32, 0, 14) {
        for (i2: int32, 0, 14) {
          PaddedX_1: Buffer(PaddedX, float32, [1176], [])[(((i0*196) + (i1*14)) + i2)] = @tir.if_then_else(((((i1 < 1) || (13 <= i1)) || (i2 < 1)) || (13 <= i2)), 0f32, X[((((i0*144) + (i1*12)) + i2) - 13)], dtype=float32)
        }
      }
    }
    

In [30]:
import torch
def get_conv_data_torch(oc, ic, n, k, p, s, ctx='cpu'):
    """ PyTorch counterpart of get_conv_data
        oc, ic, n, k, p, s : forwarded unchanged to get_conv_data
        ctx : device specifier handed to torch.device, default 'cpu'
        Return (data, weight, bias, out) as torch tensors on that device;
        data and out gain a leading axis of size 1, bias is all zeros with
        one entry per output channel
    """
    device = torch.device(ctx)

    def to_torch(array):
        return torch.tensor(array, device=device)

    data, weight, out = get_conv_data(oc, ic, n, k, p, s, to_torch)
    # Prepend a size-1 axis to both the input and the output tensor.
    data = data.unsqueeze(0)
    out = out.unsqueeze(0)
    bias = torch.zeros(out.shape[1], device=device)
    return data, weight, bias, out

In [31]:
def conv_torch(data, weight, bias, k, p, s):
    """ Run a 2-D convolution through torch.nn.functional.conv2d
        data, weight, bias : input, kernel and bias tensors
        k : kernel size (accepted for signature symmetry; not used here)
        p : padding
        s : stride
        Return the convolution result
    """
    conv_options = {'bias': bias, 'stride': s, 'padding': p}
    return torch.nn.functional.conv2d(data, weight, **conv_options)

In [34]:
import torch.nn as nn
# nn.Conv2d takes (in_channels, out_channels, kernel_size), so the input
# channel count goes first.
# NOTE(review): conv1 is not used by the check below; the comparison goes
# through conv_torch with the explicitly generated weight and bias.
conv1 = nn.Conv2d(ic, oc, k)
# Same seed and shapes as the TVM run, so both sides see identical inputs.
data1, weight1, bias1, out_torch1 = get_conv_data_torch(oc, ic, n, k, p, s)
out_torch = conv_torch(data1, weight1, bias1, k, p, s)
# out_torch has a leading axis of size 1; drop it before comparing with
# the TVM result `out`.
np.testing.assert_allclose(out.asnumpy(), out_torch[0].numpy(), atol=1e-5)

# Summary
1. We can express the computation of 2-D convolution in TVM in a fairly easy way. \
2. Deep learning workloads normally apply 2-D convolution to 4-D data tensors and kernel tensors. \
3. The naive 2-D convolution is a 6-level nested for loop.