In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import time
import Image

import theano
import theano.tensor as T

import lasagne

from collections import OrderedDict
from lasagne import utils

import subprocess

In [2]:
def make_seed(n):
    """Seed NumPy's global random state with ``n`` and register it with Lasagne.

    The ``numpy.random`` module itself (not a separate ``RandomState``) is
    passed to ``lasagne.random.set_rng``.
    """
    rng = np.random
    rng.seed(n)
    lasagne.random.set_rng(rng)

In [3]:
def build_mlp(input_var=None, BN=False):
    """Assemble the fully connected classifier used by the experiments.

    Layout: an input layer of shape ``(None, 1, sz, sz)``, three dense layers
    of 100 rectifier units each, and a dense softmax output of 10 units.
    ``sz`` is read from module scope when the function is called.

    Parameters
    ----------
    input_var : Theano variable or None
        Forwarded to the ``InputLayer`` as ``input_var``.
    BN : bool
        When true, the input layer and every hidden layer are wrapped with
        ``lasagne.layers.batch_norm``; the output layer is never wrapped.

    Returns
    -------
    The softmax output layer (the top of the layer stack).
    """
    relu = lasagne.nonlinearities.rectify

    def wrap(layer):
        # Optional batch normalisation, applied right after a layer is built.
        return lasagne.layers.batch_norm(layer) if BN else layer

    net = wrap(lasagne.layers.InputLayer(shape=(None, 1, sz, sz),
                                         input_var=input_var))

    # Only the first hidden layer names its weight initialiser explicitly.
    net = wrap(lasagne.layers.DenseLayer(net, num_units=100,
                                         nonlinearity=relu,
                                         W=lasagne.init.GlorotUniform()))

    # Second and third hidden layers rely on DenseLayer's default initialiser.
    for _ in range(2):
        net = wrap(lasagne.layers.DenseLayer(net, num_units=100,
                                             nonlinearity=relu))

    return lasagne.layers.DenseLayer(
        net, num_units=10,
        nonlinearity=lasagne.nonlinearities.softmax)

In [4]:
def get_data(n_samples=10000, side=60, data_dir="clMNIST"):
    """Run the external Torch script, then load the images and labels from disk.

    The function first executes ``th -l my_example`` (presumably the script
    that writes the files read below -- it is not visible in this notebook,
    so confirm), then reads ``<data_dir>/example<i>.png`` and
    ``<data_dir>/y<i>`` for every ``i`` in ``range(n_samples)``.

    Parameters
    ----------
    n_samples : int
        Number of image/label pairs to read.
    side : int
        Side length of the square images; each image must supply exactly
        ``side * side`` single-valued pixels.
    data_dir : str
        Directory that holds the ``example<i>.png`` and ``y<i>`` files.

    Returns
    -------
    images : uint8 ndarray of shape ``(n_samples, 1, side, side)``
    labels : uint8 ndarray of shape ``(n_samples,)``

    Raises
    ------
    subprocess.CalledProcessError
        If the ``th`` command exits with a non-zero status.
    """
    # check_call rather than call: with the exit status ignored, a failed
    # generator run would go unnoticed and the loop below would read whatever
    # files happen to be on disk (stale ones, or none at all).
    subprocess.check_call(['th', '-l', 'my_example'])

    images = np.zeros((n_samples, side * side), dtype=np.uint8)
    labels = np.zeros(n_samples, dtype=np.uint8)
    prefix = data_dir + "/"
    for i in range(n_samples):
        picture = Image.open(prefix + "example" + str(i) + ".png")
        # getdata() yields the pixels as one flat sequence.
        images[i] = np.array(picture.getdata())
        labels[i] = np.uint(np.loadtxt(prefix + "y" + str(i)))
    return np.reshape(images, (n_samples, 1, side, side)), np.ravel(labels)

In [5]:
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield ``(inputs, targets)`` minibatches of exactly ``batchsize`` items.

    ``inputs`` and ``targets`` must have equal length (checked with an
    ``assert`` when iteration starts).  Trailing samples that do not fill a
    whole batch are not yielded.  With ``shuffle`` set, samples are visited in
    an order drawn from NumPy's global RNG via ``np.random.shuffle``;
    otherwise consecutive slices are returned.
    """
    assert len(inputs) == len(targets)

    order = None
    if shuffle:
        order = np.arange(len(inputs))
        np.random.shuffle(order)

    last_start = len(inputs) - batchsize
    for lo in range(0, last_start + 1, batchsize):
        hi = lo + batchsize
        # Index array when shuffling, plain slice otherwise.
        picker = order[lo:hi] if shuffle else slice(lo, hi)
        yield inputs[picker], targets[picker]

In [6]:
def adam_update(loss_or_grads, params, learning_rate=1e-3, beta1=0.9,
                        beta2=0.999, epsilon=1e-8):
    """Build Adam update expressions with ``epsilon`` added outside the square root.

    For every parameter the step is
    ``a_t * m_t / (sqrt(v_t) + epsilon)``, where ``m_t`` and ``v_t`` are
    exponential moving averages of the gradient and the squared gradient, and
    ``a_t = learning_rate * sqrt(1 - beta2**t) / (1 - beta1**t)``.

    Parameters
    ----------
    loss_or_grads : passed to ``lasagne.updates.get_or_compute_grads``
    params : list of shared variables to update
    learning_rate, beta1, beta2, epsilon : numeric hyper-parameters

    Returns
    -------
    OrderedDict mapping each shared variable (both moment accumulators, the
    parameter itself, and the step counter) to its update expression.
    """
    grads = lasagne.updates.get_or_compute_grads(loss_or_grads, params)

    # Shared step counter; `step_next` is the value used for this update.
    step_count = theano.shared(utils.floatX(0.))
    step_next = step_count + 1
    scaled_lr = (learning_rate * T.sqrt(1 - beta2 ** step_next)
                 / (1 - beta1 ** step_next))

    updates = OrderedDict()
    for p, grad in zip(params, grads):
        current = p.get_value(borrow=True)
        # Zero-initialised accumulators shaped and typed like the parameter.
        first_moment = theano.shared(
            np.zeros(current.shape, dtype=current.dtype),
            broadcastable=p.broadcastable)
        second_moment = theano.shared(
            np.zeros(current.shape, dtype=current.dtype),
            broadcastable=p.broadcastable)

        first_next = beta1 * first_moment + (1 - beta1) * grad
        second_next = beta2 * second_moment + (1 - beta2) * grad ** 2

        updates[first_moment] = first_next
        updates[second_moment] = second_next
        updates[p] = p - scaled_lr * first_next / (T.sqrt(second_next) + epsilon)

    updates[step_count] = step_next
    return updates


def adam_update2(loss_or_grads, params, learning_rate=1e-3, beta1=0.9,
                 beta2=0.999, epsilon=1e-6):
    """Build Adam update expressions with ``epsilon`` added inside the square root.

    For every parameter the step is ``a_t * m_t / sqrt(v_t + epsilon)``, where
    ``m_t`` and ``v_t`` are exponential moving averages of the gradient and
    the squared gradient, and
    ``a_t = learning_rate * sqrt(1 - beta2**t) / (1 - beta1**t)``.
    Compared with ``adam_update`` only the placement of ``epsilon`` and its
    default (1e-6) differ.

    Parameters
    ----------
    loss_or_grads : passed to ``lasagne.updates.get_or_compute_grads``
    params : list of shared variables to update
    learning_rate, beta1, beta2, epsilon : numeric hyper-parameters

    Returns
    -------
    OrderedDict mapping each shared variable (both moment accumulators, the
    parameter itself, and the step counter) to its update expression.
    """
    grads = lasagne.updates.get_or_compute_grads(loss_or_grads, params)

    # Shared step counter; `step_next` is the value used for this update.
    step_count = theano.shared(utils.floatX(0.))
    step_next = step_count + 1
    scaled_lr = (learning_rate * T.sqrt(1 - beta2 ** step_next)
                 / (1 - beta1 ** step_next))

    updates = OrderedDict()
    for p, grad in zip(params, grads):
        current = p.get_value(borrow=True)
        # Zero-initialised accumulators shaped and typed like the parameter.
        first_moment = theano.shared(
            np.zeros(current.shape, dtype=current.dtype),
            broadcastable=p.broadcastable)
        second_moment = theano.shared(
            np.zeros(current.shape, dtype=current.dtype),
            broadcastable=p.broadcastable)

        first_next = beta1 * first_moment + (1 - beta1) * grad
        second_next = beta2 * second_moment + (1 - beta2) * grad ** 2

        updates[first_moment] = first_next
        updates[second_moment] = second_next
        # epsilon goes under the root here, unlike adam_update.
        updates[p] = p - scaled_lr * first_next / (T.sqrt(second_next + epsilon))

    updates[step_count] = step_next
    return updates

In [7]:
# Side length of the square, single-channel network input: build_mlp reads this
# global when it declares its InputLayer shape (None, 1, sz, sz).
sz = 60

In [14]:
def run_method(method, model='mlp', BN=False, num_epochs=50, alpha=0.1, mu=0.9, beta1=0.9, beta2=0.999, echo=False,
               batch_size=100, epsilon=1e-8):
    """Train the network from ``build_mlp`` with one update rule and record metrics.

    Every epoch calls ``get_data()`` twice -- once for the training pass and
    once for the validation pass -- and one more call after the last epoch
    supplies the test pass.

    Parameters
    ----------
    method : callable
        Update rule called as ``method(loss, params, learning_rate=alpha, ...)``.
        ``lasagne.updates.sgd`` gets no further arguments;
        ``lasagne.updates.momentum`` also gets ``momentum=mu``;
        ``lasagne.updates.adam``, ``adam_update`` and ``adam_update2`` get
        ``beta1``, ``beta2`` and ``epsilon``; any other rule gets ``epsilon``.
    model : str
        Only ``'mlp'`` is recognised.  Any other value prints a message and
        makes the function return ``None``.
    BN : bool
        Forwarded to ``build_mlp``.
    num_epochs : int
        Number of training epochs.
    alpha : float
        Learning rate.
    mu, beta1, beta2, epsilon : float
        Hyper-parameters forwarded to the update rule as described above.
    echo : bool
        Print progress, per-epoch training loss and final test metrics.
    batch_size : int
        Minibatch size for training, validation and test passes.

    Returns
    -------
    dict or None
        ``'train_err'``, ``'val_err'``, ``'train_acc'``, ``'val_acc'``: arrays
        with one entry per epoch (accuracies in percent); ``'test_err'`` and
        ``'test_acc'``: averages over the final test pass.
    """
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    if echo:
        print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var, BN)
    else:
        print("Unrecognized model type %r." % model)
        return

    # Training-mode expressions.
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # Symbolic expressions carry an `_expr` suffix so the numeric accumulators
    # further down cannot rebind them.
    train_acc_expr = T.mean(T.eq(T.argmax(prediction, axis=1), target_var),
                            dtype=theano.config.floatX)

    params = lasagne.layers.get_all_params(network, trainable=True)

    if method == lasagne.updates.sgd:
        updates = method(loss, params, learning_rate=alpha)
    elif method == lasagne.updates.momentum:
        updates = method(loss, params, learning_rate=alpha, momentum=mu)
    elif method in (lasagne.updates.adam, adam_update, adam_update2):
        # All three Adam variants receive the caller's beta2 and epsilon;
        # previously lasagne.updates.adam silently fell back to its own
        # defaults for both.
        updates = method(loss, params, learning_rate=alpha, beta1=beta1, beta2=beta2, epsilon=epsilon)
    else:
        updates = method(loss, params, learning_rate=alpha, epsilon=epsilon)

    # Deterministic expressions used for validation and testing.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    test_acc_expr = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                           dtype=theano.config.floatX)

    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    train_fn_acc = theano.function([input_var, target_var], train_acc_expr)
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc_expr])

    if echo:
        print("Starting training...")

    res = dict()
    arr_train_err = []
    arr_val_err = []
    arr_train_acc = []
    arr_val_acc = []

    for epoch in range(num_epochs):
        train_err = 0
        train_acc = 0
        train_batches = 0
        start_time = time.time()

        train_inputs, train_targets = get_data()

        for inputs, targets in iterate_minibatches(train_inputs, train_targets, batch_size, shuffle=True):
            train_err += train_fn(inputs, targets)
            # Accuracy is evaluated by a second call, i.e. after the update
            # that train_fn just applied for this batch.
            train_acc += train_fn_acc(inputs, targets)
            train_batches += 1

        arr_train_err.append(train_err / train_batches)
        arr_train_acc.append(train_acc / train_batches * 100)

        val_inputs, val_targets = get_data()
        val_err, val_acc, val_batches = _sum_over_batches(
            val_fn, val_inputs, val_targets, batch_size)

        arr_val_err.append(val_err / val_batches)
        arr_val_acc.append(val_acc / val_batches * 100)

        if echo:
            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            print("  training loss:\t\t{:.6f}".format(train_err / train_batches))

    test_inputs, test_targets = get_data()
    test_err, test_acc, test_batches = _sum_over_batches(
        val_fn, test_inputs, test_targets, batch_size)

    if echo:
        print("Final results:")
        print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:\t\t{:.2f} %".format(
            test_acc / test_batches * 100))

    res['train_err'] = np.array(arr_train_err)
    res['val_err'] = np.array(arr_val_err)
    res['train_acc'] = np.array(arr_train_acc)
    res['val_acc'] = np.array(arr_val_acc)
    res['test_err'] = test_err / test_batches
    res['test_acc'] = test_acc / test_batches * 100

    return res


def _sum_over_batches(eval_fn, inputs, targets, batch_size):
    """Sum ``eval_fn``'s (loss, accuracy) over unshuffled minibatches; also return the batch count."""
    loss_sum = 0
    acc_sum = 0
    n_batches = 0
    for batch_inputs, batch_targets in iterate_minibatches(inputs, targets, batch_size, shuffle=False):
        err, acc = eval_fn(batch_inputs, batch_targets)
        loss_sum += err
        acc_sum += acc
        n_batches += 1
    return loss_sum, acc_sum, n_batches

In [15]:
%%time
# Adagrad on the plain network (BN keeps its default, False): learning rate
# 1e-2, 50 epochs, RNG re-seeded with 100 first.  run_method forwards its own
# epsilon default (1e-8) to the update rule.
make_seed(100)
adagrad = run_method(lasagne.updates.adagrad, num_epochs=50, alpha=1e-2)

CPU times: user 17min 57s, sys: 7min 45s, total: 25min 43s
Wall time: 24min 45s


In [None]:
import pickle
# Restores `adagrad` from the file 'adagrad'.  The cell that writes this file
# dumps two objects into it in sequence (adagrad, then bn_adagrad); only the
# first one is read back here.
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# it on files produced by this notebook.
with open('adagrad', 'rb') as f:
    adagrad = pickle.load(f)

In [16]:
%%time
# Adagrad with batch normalisation (BN=True): learning rate 1e-1, 50 epochs,
# RNG re-seeded with 100 first.
make_seed(100)
bn_adagrad = run_method(lasagne.updates.adagrad, num_epochs=50, alpha=1e-1, BN=True)

CPU times: user 21min 24s, sys: 16min 17s, total: 37min 41s
Wall time: 28min 21s


In [17]:
import pickle
# Writes both Adagrad results to the file 'adagrad' as two consecutive pickles;
# reading them back takes two pickle.load calls on one handle, in this order.
with open('adagrad', 'wb') as f:
    pickle.dump(adagrad, f)
    pickle.dump(bn_adagrad, f)

In [43]:
# Accuracy curves of the two Momentum runs: plain network in blue, batch-
# normalised network in red; dashed = training, solid = validation.
# Requires `momentum` and `bn_momentum`, which are created by the Momentum
# training cells that appear further down in this file -- run those first.
plt.plot(momentum['train_acc'], 'b--')
plt.plot(momentum['val_acc'], 'b')
plt.plot(bn_momentum['train_acc'], 'r--')
plt.plot(bn_momentum['val_acc'], 'r')
plt.xlabel('epochs')
plt.ylabel('accuracy')
# `right=` replaces the deprecated `xmax=` keyword, which newer matplotlib
# releases no longer accept; 49 is the last index of a 50-epoch run.
plt.xlim(right=49)
plt.legend(['Momentum train', 'Momentum validation', 'BN Momentum train', 'BN Momentum validation'], loc=0, fontsize=12)
plt.title('Momentum, 3HL')
plt.show()

In [27]:
%%time
# Adadelta on the plain network (BN keeps its default, False): learning rate
# 1.0, 50 epochs, RNG re-seeded with 100 first.
make_seed(100)
adadelta = run_method(lasagne.updates.adadelta, num_epochs=50, alpha=1.0)

CPU times: user 17min 58s, sys: 10min 3s, total: 28min 1s
Wall time: 25min 7s


In [28]:
%%time
# Adadelta with batch normalisation (BN=True): learning rate 1.0, 50 epochs,
# RNG re-seeded with 100 first.
make_seed(100)
bn_adadelta = run_method(lasagne.updates.adadelta, num_epochs=50, alpha=1.0, BN=True)

CPU times: user 20min 28s, sys: 17min 22s, total: 37min 50s
Wall time: 28min 28s


In [29]:
import pickle
# Writes both Adadelta results to the file 'adadelta' as two consecutive
# pickles; reading them back takes two pickle.load calls, in this order.
with open('adadelta', 'wb') as f:
    pickle.dump(adadelta, f)
    pickle.dump(bn_adadelta, f)

In [30]:
%%time
# RMSprop on the plain network (BN keeps its default, False): learning rate
# 1e-2, 50 epochs, RNG re-seeded with 100 first.
make_seed(100)
rmsprop = run_method(lasagne.updates.rmsprop, num_epochs=50, alpha=1e-2)

CPU times: user 17min 40s, sys: 8min 37s, total: 26min 18s
Wall time: 23min 54s


In [31]:
%%time
# RMSprop with batch normalisation (BN=True): learning rate 1e-2, 50 epochs,
# RNG re-seeded with 100 first.
make_seed(100)
bn_rmsprop = run_method(lasagne.updates.rmsprop, num_epochs=50, alpha=1e-2, BN=True)

CPU times: user 20min 32s, sys: 16min 13s, total: 36min 46s
Wall time: 24min 38s


In [32]:
import pickle
# Writes both RMSprop results to the file 'rmsprop' as two consecutive pickles;
# reading them back takes two pickle.load calls, in this order.
with open('rmsprop', 'wb') as f:
    pickle.dump(rmsprop, f)
    pickle.dump(bn_rmsprop, f)

In [33]:
%%time
# Plain SGD on the plain network (BN keeps its default, False): learning rate
# 1e-1, 50 epochs, RNG re-seeded with 100 first.
make_seed(100)
sgd = run_method(lasagne.updates.sgd, num_epochs=50, alpha=1e-1)

CPU times: user 15min 12s, sys: 5min 13s, total: 20min 26s
Wall time: 21min 20s


In [34]:
%%time
# Plain SGD with batch normalisation (BN=True): learning rate 1.0, 50 epochs,
# RNG re-seeded with 100 first.
make_seed(100)
bn_sgd = run_method(lasagne.updates.sgd, num_epochs=50, alpha=1.0, BN=True)

CPU times: user 18min 7s, sys: 12min 10s, total: 30min 17s
Wall time: 23min 20s


In [35]:
import pickle
# Writes both SGD results to the file 'sgd' as two consecutive pickles;
# reading them back takes two pickle.load calls, in this order.
with open('sgd', 'wb') as f:
    pickle.dump(sgd, f)
    pickle.dump(bn_sgd, f)

In [36]:
%%time
# SGD with momentum on the plain network (BN keeps its default, False):
# learning rate 1e-1, momentum left at run_method's default mu=0.9, 50 epochs,
# RNG re-seeded with 100 first.
make_seed(100)
momentum = run_method(lasagne.updates.momentum, num_epochs=50, alpha=1e-1)

CPU times: user 15min 9s, sys: 4min 39s, total: 19min 48s
Wall time: 19min 32s


In [37]:
%%time
# SGD with momentum and batch normalisation (BN=True): learning rate 1.0,
# momentum left at run_method's default mu=0.9, 50 epochs, RNG re-seeded with
# 100 first.
make_seed(100)
bn_momentum = run_method(lasagne.updates.momentum, num_epochs=50, alpha=1.0, BN=True)

CPU times: user 17min 58s, sys: 11min 38s, total: 29min 37s
Wall time: 20min 25s


In [38]:
import pickle
# Writes both Momentum results to the file 'momentum' as two consecutive
# pickles; reading them back takes two pickle.load calls, in this order.
with open('momentum', 'wb') as f:
    pickle.dump(momentum, f)
    pickle.dump(bn_momentum, f)