# Training some neural net on mnist with pure tvm

Here we are going to train a neural net in pure tvm using tvm-level automatic differentiation.

In [1]:
import topi
import tvm

import numpy as np

In [None]:
topi.reshape()

In [2]:
batch_size = 128
num_classes = 10

import keras
from keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Batch generator. The last incomplete batch is thrown out because nnvm uses fixed batch size. We use the same function for keras so that the training results are closer.

In [3]:
def batches(x, y):
    for i in range(int(x.shape[0] / batch_size)):
        yield (x[i:i+batch_size, ...].astype('float32'),
               y[i:i+batch_size, ...].astype('float32'))

## Defining the model

This is the keras definition of the model:

In [4]:
keras_model = keras.models.Sequential()
keras_model.add(keras.layers.Reshape((28, 28, 1), input_shape=(28, 28)))
keras_model.add(keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu'))
keras_model.add(keras.layers.Conv2D(64, (3, 3), activation='relu'))
keras_model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
keras_model.add(keras.layers.Flatten())
keras_model.add(keras.layers.Dense(128, activation='relu'))
keras_model.add(keras.layers.Dense(num_classes, activation='softmax'))

keras_model.compile(loss='categorical_crossentropy',
                    optimizer=keras.optimizers.SGD(lr=1e-2))

This is the same thing written in tvm.

In [5]:
x = tvm.placeholder((batch_size, 28, 28))
y = tvm.placeholder((batch_size, num_classes))

w1 = tvm.placeholder((32, 1, 3, 3))
b1 = tvm.placeholder((32,))
w2 = tvm.placeholder((64, 32, 3, 3))
b2 = tvm.placeholder((64,))
w3 = tvm.placeholder((128, 9216))
b3 = tvm.placeholder((128,))
w4 = tvm.placeholder((num_classes, 128))
b4 = tvm.placeholder((num_classes,))

t = topi.reshape(x, (batch_size, 1, 28, 28))
t = topi.nn.relu(topi.nn.conv2d(t, w1, 1, 0) + topi.reshape(b1, (1, 32, 1, 1)))
t = topi.nn.relu(topi.nn.conv2d(t, w2, 1, 0) + topi.reshape(b2, (1, 64, 1, 1)))
t = topi.nn.pool(t, [2, 2], [2, 2], [0, 0, 0, 0], 'max')
t = topi.nn.flatten(t)
t = topi.nn.relu(topi.nn.dense(t, w3, b3))
t = topi.nn.dense(t, w4, b4)

predictions = topi.nn.softmax(t)
loss = - topi.sum(y * topi.nn.log_softmax(t)) / batch_size

In [6]:
weights = [w1, b1, w2, b2, w3, b3, w4, b4]

In [15]:
head = topi.full((1,), 'float32', 1.0)
gradients = [tvm.ir_pass.JacobianRecursive(loss, w, head) for w in weights]
learning_rate = tvm.placeholder(())
new_weights = [w - learning_rate*g for w, g in zip(weights, gradients)]

## Compiling and initializing the weights

In [26]:
def get_shape(tensor):
    return [s.value for s in tensor.shape]

def empty_val(tensor):
    if isinstance(tensor, list):
        return [empty_val(t) for t in tensor]
    else:
        return tvm.nd.empty(get_shape(tensor), tensor.dtype)

In [27]:
weights_values = empty_val(weights)

In [28]:
testing_module = tvm.build(tvm.create_schedule(loss.op), [loss, x, y] + weights)

def tvm_test(xval, yval):
    args = [empty_val(loss)] + [tvm.ndarray.array(xval), tvm.ndarray.array(yval)] + weights_values
    testing_module(*args)
    return args[0].asnumpy()

In [32]:
xx, yy = next(batches(x_train, y_train))
tvm_test(xx, yy)

array(nan, dtype=float32)

In [6]:
training_cgraph, training_libmod, training_params = nnvm.compiler.build(training_graph, 'llvm')
training_module = tvm.contrib.graph_runtime.create(training_cgraph, training_libmod, tvm.cpu(0))

if training_params:
    training_module.set_input(**training_params)

testing_cgraph, testing_libmod, testing_params = nnvm.compiler.build(testing_graph, 'llvm')
testing_module = tvm.contrib.graph_runtime.create(testing_cgraph, testing_libmod, tvm.cpu(0))

if testing_params:
    testing_module.set_input(**testing_params)

Randomly initialize weights.

In [7]:
np.random.seed(42)

shapes = training_graph.apply('InferShape').json_attr('shape')
    
for v in loss.list_input_variables():
    shape = shapes[training_graph.index.node_id(v.attr('name'))]
    print("Initializing " + str(v.attr('name')) + " " + str(shape))
    if 'bias' in str(v.attr('name')):
        training_module.set_input(v.attr('name'), np.zeros(shape).astype('float32'))
    else:
        training_module.set_input(v.attr('name'), np.random.normal(scale=0.05, size=shape).astype('float32'))

Initializing y [128, 10]
Initializing x [128, 28, 28]
Initializing dense0_weight [544, 784]
Initializing dense0_bias [544]
Initializing dense1_weight [512, 544]
Initializing dense1_bias [512]
Initializing dense2_weight [10, 512]
Initializing dense2_bias [10]


Auxiliary functions, one returns the weights from the training graph, the other assigns these weights to the keras model.

In [8]:
def get_weights():
    for v in weight_vars:
        shape = shapes[training_graph.index.node_id(v.attr('name'))]
        yield v.attr('name'), training_module.get_input(v.attr('name'), tvm.nd.empty(shape))

def assign_nnvm_weights_to_keras():
    keras_model.set_weights([np.transpose(v.asnumpy()) for _, v in get_weights()])

## Training and testing the reference keras model

Let's first train the reference keras model. We will use the initial weights from the nnvm graph to make comparison fairer.

In [55]:
keras_model.fit_generator(batches(x_train, y_train), steps_per_epoch=int(len(x_train) / batch_size))

test_loss = 0
for step, (xs, ys) in enumerate(batches(x_test, y_test)):
    test_loss += keras_model.test_on_batch(xs, ys)
        
test_loss /= step

print("keras test loss: {}".format(test_loss))

Epoch 1/1
keras test loss: 0.40138487846820387


## Training the nnvm model

Train the nnvm model.

In [10]:
%%time
seen = 0
for step, (xs, ys) in enumerate(batches(x_train, y_train)):
    # load data
    training_module.set_input('x', xs)
    training_module.set_input('y', ys)
    # run a training step
    training_module.run()
    
    seen += xs.shape[0]
    train_loss = training_module.get_output(0, tvm.nd.empty((1,))).asnumpy()[0]
    
    if step % 10 == 0:
        print("seen: {}  train loss: {}".format(seen, train_loss), end='\r')
        
print("")

seen: 59008  train loss: 0.46966883540153503
CPU times: user 2min 8s, sys: 5min 57s, total: 8min 6s
Wall time: 24.4 s


## Testing  the nnvm model

Move weights to the testing module.

In [11]:
for name, val in get_weights():
    testing_module.set_input(name, val)

Compute loss on the test set.

In [12]:
test_loss = 0
for step, (xs, ys) in enumerate(batches(x_test, y_test)):
    testing_module.set_input('x', xs)
    testing_module.set_input('y', ys)
    testing_module.run()
    
    test_loss += testing_module.get_output(0, tvm.nd.empty((1,))).asnumpy()[0]
        
test_loss /= step
print("test loss: {}".format(test_loss))

test loss: 0.6421400455685405


To make sure that we compute everything correctly, move nnvm weights to the keras model and compute the test loss using keras.

In [13]:
assign_nnvm_weights_to_keras()

In [14]:
test_loss = 0
for step, (xs, ys) in enumerate(batches(x_test, y_test)):
    test_loss += keras_model.test_on_batch(xs, ys)
        
test_loss /= step
print("test loss: {}".format(test_loss))

test loss: 0.6421400262163831
