In [1]:
from mxnet import nd as np
from mxnet import autograd
from keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


(60000, 28, 28)
(60000,)
(10000, 28, 28)
(10000,)


In [2]:
from PIL import Image
i = Image.fromarray(x_train[0])
i.show()

print(y_train[0])

5


In [3]:
def relu(x):
    return np.maximum(x, np.zeros_like(x))

print(relu(np.array([9, 4, 3, -1, -4])))


[9. 4. 3. 0. 0.]
<NDArray 5 @cpu(0)>


In [4]:
def softmax(x):
    x = x-np.max(x)
    return np.exp(x) / np.sum(np.exp(x))

print(softmax(np.array([0.3, 1.9, 0.3, 0.4])))


[0.12409715 0.61465716 0.12409715 0.13714856]
<NDArray 4 @cpu(0)>


In [5]:
def crossentropy(y, yhat):
    return -np.nansum(y*np.log(yhat), axis=1)

print(crossentropy(np.array([[0]]), np.array([[0]])))


[-0.]
<NDArray 1 @cpu(0)>


In [6]:
def sgd(params, lr):
    for param in params:
        param[:] -= lr*param.grad

In [7]:
input_size = 784
output_size = 10
h1_size = 300
h2_size = 100
scale = 0.01

W1 = np.normal(shape=(input_size, h1_size), scale=scale)
W2 = np.normal(shape=(h1_size, h2_size), scale=scale)
W3 = np.normal(shape=(h2_size, output_size), scale=scale)

b1 = np.normal(shape=h1_size, scale=scale)
b2 = np.normal(shape=h2_size, scale=scale)
b3 = np.normal(shape=output_size, scale=scale)

In [8]:
params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.attach_grad()

In [9]:
def net(x):
    int1 = relu(np.dot(x, W1) + b1)
    int2 = relu(np.dot(int1, W2) + b2)
    return softmax(np.dot(int2, W3) + b3)

In [10]:
x_train = np.array(x_train) / 255.
x_test = np.array(x_test) / 255.

y_train = np.array(y_train)
y_test = np.array(y_test)

In [11]:
def evaluate(data, label):
    num_iters = len(data) // batch_size
    numerator = 0.0
    denominator = 0.0
    for i in range(num_iters):
        x = data[i*batch_size:(i+1)*batch_size].reshape((-1, input_size))
        y = label[i*batch_size:(i+1)*batch_size]
        output = net(x)
        output = np.argmax(output, axis=1)
        numerator += np.sum(output == y)
        denominator += len(x)
    #return (numerator/denominator).asscalar()
    print (numerator)
    print (denominator) 

In [12]:
epochs = 10
batch_size = 64
num_iters = len(x_train) // batch_size
lr = 0.001

for e in range(epochs):
    cur_loss = 0.0
    for i in range(num_iters):
        x = x_train[i*batch_size:(i+1)*batch_size].reshape((-1, input_size))
        y = y_train[i*batch_size:(i+1)*batch_size]
        y = np.one_hot(y, output_size)
        with autograd.record():
            output = net(x)
            loss = crossentropy(y, output)
        loss.backward()
        cur_loss += np.sum(loss).asscalar()
        sgd(params, lr)
    #print('Training loss after epoch %d = %f. Training accuracy = %f. Testing accuracy = %f' %    
          #(e, cur_loss/batch_size, evaluate(x_train, y_train), evaluate(x_test, y_test)))
        

In [13]:
evaluate(x_test, y_test)


[9716.]
<NDArray 1 @cpu(0)>
9984.0
