TODO 5: look into how to use a pretrained DNN as a feature extractor.

# Modern net

In [4]:
import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np
from load import mnist

In [5]:
srng = RandomStreams()

def floatX(X):
    """Return ``X`` as a NumPy array whose dtype is ``theano.config.floatX``."""
    target_dtype = theano.config.floatX
    return np.asarray(X, dtype=target_dtype)

def init_weights(shape):
    """Create a Theano shared variable holding random initial weights.

    The values are drawn from a standard normal distribution with the given
    ``shape``, scaled by 0.01, and converted with ``floatX`` before being
    wrapped in ``theano.shared``.
    """
    initial_values = np.random.randn(*shape) * 0.01
    return theano.shared(floatX(initial_values))

In [6]:
def rectify(X):
    """Rectified linear unit: the element-wise maximum of ``X`` and zero."""
    floor = 0.
    return T.maximum(X, floor)

Numerically stable softmax: subtract each row's maximum before exponentiating, so that `exp` cannot overflow.

In [7]:
def softmax(X):
    """Row-wise softmax of a 2-D symbolic tensor.

    Each row's maximum is subtracted before exponentiating, so the largest
    exponent in every row is zero; the exponentials are then divided by
    their row sum.
    """
    # dimshuffle(0, 'x') turns the per-row vector into a broadcastable column.
    row_max = X.max(axis=1).dimshuffle(0, 'x')
    exponentials = T.exp(X - row_max)
    row_total = exponentials.sum(axis=1).dimshuffle(0, 'x')
    return exponentials / row_total

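RMSprop keeps a moving average of the squared gradient, i.e. `moving average(magnitude(gradient))`, for every parameter, to control the learning rate near the target.

Each gradient is scaled by dividing it by the square root of that `moving average`.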

In [8]:
def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    """Build the RMSprop update list for ``theano.function(updates=...)``.

    For every parameter a shared accumulator (initialised to zeros of the
    parameter's shape) tracks a moving average of the squared gradient. The
    gradient is divided by the square root of the new average before the
    step is taken.

    Args:
        cost: symbolic scalar to differentiate.
        params: list of shared variables to differentiate with respect to.
        lr: step size multiplied into the scaled gradient.
        rho: decay factor of the moving average.
        epsilon: constant added under the square root to avoid dividing by zero.

    Returns:
        A list of ``(shared_variable, new_expression)`` pairs: for each
        parameter, first the accumulator update, then the parameter update.
    """
    updates = []
    gradients = T.grad(cost=cost, wrt=params)
    for param, grad in zip(params, gradients):
        # Multiplying the current value by 0. yields zeros of matching shape and dtype.
        running_sq = theano.shared(param.get_value() * 0.)
        running_sq_new = rho * running_sq + (1 - rho) * grad ** 2
        scaled_grad = grad / T.sqrt(running_sq_new + epsilon)
        updates.extend([
            (running_sq, running_sq_new),
            (param, param - lr * scaled_grad),
        ])
    return updates

Dropout: randomly drop out a fraction `p` of the units and scale the rest by `1 / (1 - p)`.

In [9]:
def dropout(X, p=0.):
    """Apply dropout with drop probability ``p`` to the symbolic tensor ``X``.

    When ``p`` is not positive, ``X`` is returned unchanged. Otherwise ``X``
    is multiplied by a binomial mask sampled from the module-level ``srng``
    with success probability ``1 - p``, and the result is divided by
    ``1 - p``.
    """
    if not p > 0:
        return X
    keep_prob = 1 - p
    mask = srng.binomial(X.shape, p=keep_prob, dtype=theano.config.floatX)
    return (X * mask) / keep_prob

2 hidden layers

In [12]:
def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    """Build the symbolic forward pass of a network with two hidden layers.

    Dropout is applied to the input with probability ``p_drop_input`` and to
    the output of each rectified hidden layer with probability
    ``p_drop_hidden``; the final layer is a softmax.

    Returns:
        ``(h, h2, py_x)``: the first and second hidden activations, both
        after dropout, and the softmax output.
    """
    noisy_input = dropout(X, p_drop_input)

    # Each hidden activation is rectified first, then passed through dropout.
    h = dropout(rectify(T.dot(noisy_input, w_h)), p_drop_hidden)
    h2 = dropout(rectify(T.dot(h, w_h2)), p_drop_hidden)

    py_x = softmax(T.dot(h2, w_o))
    return h, h2, py_x

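Add noise to the model: build a noisy copy (with dropout) that is used for the training cost, and a noise-free copy that is used for prediction.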

In [13]:
# Load the data from the project's `load` module.
# NOTE(review): presumably train/test images and one-hot labels — confirm against load.mnist.
trX, teX, trY, teY = mnist(onehot=True)

# Symbolic float32 matrices for a batch of inputs and the matching targets.
X = T.fmatrix()
Y = T.fmatrix()

# Weight matrices: 784 -> 625 -> 625 -> 10.
w_h = init_weights((784, 625))
w_h2 = init_weights((625, 625))
w_o = init_weights((625, 10))

# Noisy graph: dropout of 0.2 on the input and 0.5 on both hidden layers.
noise_h, noise_h2, noise_py_x = model(X, w_h, w_h2, w_o, 0.2, 0.5)

# Noise-free graph over the same weights (dropout disabled).
h, h2, py_x = model(X, w_h, w_h2, w_o, 0., 0.)

# Predicted class: index of the largest output of the noise-free graph.
y_x = T.argmax(py_x, axis=1)

# The training cost is the mean cross-entropy of the *noisy* output against Y.
cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))

params = [w_h, w_h2, w_o]

updates = RMSprop(cost, params, lr=0.001)

In [15]:
# Compile the training step (returns the cost and applies the RMSprop
# updates) and the prediction function (returns the predicted class indices).
train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

batch_size = 128
for i in range(100):
    # Iterate over start offsets and derive `end` from them. Zipping
    # range(0, n, 128) with range(128, n, 128) always omitted the last slice,
    # so the tail of the training set was never used. Slicing past the end
    # is safe and simply yields a shorter final batch.
    for start in range(0, len(trX), batch_size):
        end = start + batch_size
        # NOTE: this rebinds `cost` from the symbolic expression to the
        # numeric cost of the last batch; `train` is already compiled, so
        # training is unaffected.
        cost = train(trX[start:end], trY[start:end])
    # Report test-set accuracy after each pass over the training data.
    print(np.mean(np.argmax(teY, axis=1) == predict(teX)))

0.9391
0.9627
0.9723
0.9759
0.9771
0.9789
0.9782
0.9809
0.9815
0.981
0.9824
0.9823
0.9827
0.9834
0.9836
0.9832
0.9841
0.9841
0.9846
0.9852
0.985
0.9844
0.9855
0.9853
0.9857
0.9852
0.986
0.9855
0.9858
0.9857
0.9858
0.9862
0.9859
0.9861
0.9856
0.9863
0.9861
0.9863
0.9874
0.9882
0.9874
0.9861
0.9865
0.9871
0.9856
0.9865
0.9864
0.9862
0.9864
0.9856
0.9858
0.9862
0.9865
0.9861
0.9863
0.9858
0.9861
0.9861
0.9861
0.9867
0.9875
0.987
0.9879
0.9865
0.9866
0.9872
0.9869
0.9867
0.9866
0.9869
0.9867
0.9873
0.9874
0.9866
0.9866
0.9868
0.9874
0.9878
0.9873
0.9877
0.9873
0.9884
0.9868
0.9866
0.9875
0.9875
0.9867
0.9866
0.9869
0.9873
0.9868
0.9869
0.9872
0.9871
0.9869
0.9875
0.9873
0.9871
0.9867
0.9873
