In [1]:
%pylab inline

import theano
import theano.tensor.signal.downsample

from fuel.transformers import ScaleAndShift, Cast
from fuel.streams import DataStream
from fuel.schemes import ShuffledScheme, SequentialScheme

from blocks import *

Populating the interactive namespace from numpy and matplotlib


In [14]:
### Generic config

# Minibatch sizes used by the training, validation and test data streams.
Cfg_Data_BatchSize_Train    = 20
Cfg_Data_BatchSize_Validate = 100
Cfg_Data_BatchSize_Test     = 100

# True selects the 'fast_compile' optimizer with verbose exceptions;
# False selects the 'fast_run' optimizer with terse exceptions.
Cfg_Theano_Debug = True

# All floating point tensors and shared variables in this notebook are float32.
theano.config.floatX = 'float32'
if not Cfg_Theano_Debug:
    theano.config.optimizer = 'fast_run'
    theano.config.exception_verbosity = 'low'
else:
    print("Set Theano to debug mode.")
    theano.config.optimizer = 'fast_compile'
    theano.config.exception_verbosity = 'high'


Set Theano to debug mode.


In [3]:
from fuel.datasets.cifar10 import CIFAR10

# Preprocessing that DataStream.default_stream applies to the 'features'
# source only: multiply by 2/255 and add -1, then cast to float32.
# (Assumes raw pixels are in 0..255, which would map them onto [-1, 1] -- TODO confirm.)
# 'which_sources' is passed as a tuple of source names, which is the form
# Fuel documents; the label source is left untouched.
Cfg_Data_FeatureSources = ('features',)
CIFAR10.default_transformers = (
    (ScaleAndShift, [2.0 / 255.0, -1], {'which_sources': Cfg_Data_FeatureSources}),
    (Cast, [np.float32], {'which_sources': Cfg_Data_FeatureSources}),
)

# Index at which the official "train" set is split: examples before it are
# used for training, the remaining ones for validation.
Cfg_Data_TrainSplit = 45000

Data_Train      = CIFAR10(("train",), subset=slice(None, Cfg_Data_TrainSplit))
Data_Validation = CIFAR10(("train",), subset=slice(Cfg_Data_TrainSplit, None))
Data_Test       = CIFAR10(("test",))

# Training batches are visited in shuffled order; validation and test batches
# are visited sequentially.
Stream_Train = DataStream.default_stream(
    Data_Train,
    iteration_scheme=ShuffledScheme(Data_Train.num_examples, Cfg_Data_BatchSize_Train)
)
Stream_Validation = DataStream.default_stream(
    Data_Validation,
    iteration_scheme=SequentialScheme(Data_Validation.num_examples, Cfg_Data_BatchSize_Validate)
)
Stream_Test = DataStream.default_stream(
    Data_Test,
    iteration_scheme=SequentialScheme(Data_Test.num_examples, Cfg_Data_BatchSize_Test)
)

print("Loaded input data.")
print("Subset sizes: Train %d, Validation %d, Test %d" % (Data_Train.num_examples, Data_Validation.num_examples, Data_Test.num_examples))

def GetNextBatch(stream):
    """Return the first batch of a freshly started epoch of `stream`.

    A new epoch iterator is requested on every call, so the result is always
    the first item that iterator yields, not a continuation of earlier calls.
    """
    epoch_iterator = stream.get_epoch_iterator()
    first_batch = next(epoch_iterator)
    return first_batch

# Sanity check: pull one batch from the training and the test stream and
# report the shape and dtype of its images and labels.
print("Stream data shapes:")
x, y = GetNextBatch(Stream_Train)
train_batch_info = (x.shape, x.dtype, y.shape, y.dtype)
print("Stream_Train: image batch of shape %s [%s], and label batch of shape %s [%s]" % train_batch_info)
x, y = GetNextBatch(Stream_Test)
test_batch_info = (x.shape, x.dtype, y.shape, y.dtype)
print("Stream_Test: image batch of shape %s [%s], and label batch of shape %s [%s]" % test_batch_info)

Loaded input data.
Subset sizes: Train 45000, Validation 5000, Test 10000
Stream data shapes:
Stream_Train: image batch of shape (20, 3, 32, 32) [float32], and label batch of shape (20, 1) [uint8]
Stream_Test: image batch of shape (100, 3, 32, 32) [float32], and label batch of shape (100, 1) [uint8]


In [20]:
# Symbolic inputs of the computational graph. Their values have to be
# provided whenever a compiled function is called.
# X is an image batch: batch_size x num_channels x img_rows x img_columns
X = theano.tensor.tensor4('X')

# Y holds the targets as a uint8 matrix with one row per example (the streams
# yield label batches of shape batch_size x 1); it is flattened with ravel()
# wherever a vector of targets is needed.
Y = theano.tensor.matrix('Y', dtype='uint8')

# Test values are attached to the inputs below, but their computation is
# disabled here ('off'). Set this to 'warn' or 'raise' (and uncomment the
# shape prints further down) to debug shapes while the graph is being built.
theano.config.compute_test_value = 'off'

# A 3-example sample of a training batch serves as the test value.
X_test_value, Y_test_value = GetNextBatch(Stream_Train)
X.tag.test_value = X_test_value[:3]
Y.tag.test_value = Y_test_value[:3]

# Every trainable shared variable of the network is collected in this list.
model_parameters = []

# The first convolutional layer.
# Filter bank shape: num_out_filters x num_in_filters x filter_height x filter_width
num_filters_1 = 10  # number of convolution filters applied in the first layer
CW1_shape = (num_filters_1, 3, 5, 5)
CW1 = theano.shared(np.zeros(CW1_shape, dtype='float32'),
                    name='CW1')
# Please note - this is somewhat non-standard: the initializer is stored on
# the variable's tag and only applied later, by init_parameters().
CW1.tag.initializer = IsotropicGaussian(0.05)

# One bias per output filter.
CB1 = theano.shared(np.zeros((num_filters_1,), dtype='float32'),
                    name='CB1')
CB1.tag.initializer = Constant(0.0)
model_parameters += [CW1, CB1]

# ReLU(conv(X, CW1) + bias). The bias vector is made broadcastable over the
# batch, row and column axes with dimshuffle.
# filter_shape has to describe the whole 4D filter bank
# (filters, input channels, rows, columns), not only the 5x5 kernel size.
after_C1 = theano.tensor.maximum(
    0.0,
    theano.tensor.nnet.conv2d(X, CW1, filter_shape=CW1_shape) + CB1.dimshuffle('x',0,'x','x')
    )
# print "after_C1 shape: %s" % (after_C1.tag.test_value.shape,)
# 2x2 max pooling; incomplete border windows are dropped.
after_P1 = theano.tensor.signal.downsample.max_pool_2d(after_C1, (2,2), ignore_border=True)
# print "after_P1 shape: %s" % (after_P1.tag.test_value.shape,)


# The second convolutional layer: 25 filters of size 5x5, each reading all
# num_filters_1 pooled feature maps produced by the first layer.
num_filters_2 = 25
CW2 = theano.shared(
    value=np.zeros((num_filters_2, num_filters_1, 5, 5), dtype='float32'),
    name='CW2')
CW2.tag.initializer = IsotropicGaussian(0.05)

# One bias per output filter.
CB2 = theano.shared(
    value=np.zeros((num_filters_2,), dtype='float32'),
    name='CB2')
CB2.tag.initializer = Constant(0.0)
model_parameters += [CW2, CB2]

# ReLU(conv(after_P1, CW2) + bias), with the bias broadcast over the batch,
# row and column axes.
pre_C2 = theano.tensor.nnet.conv2d(after_P1, CW2) + CB2.dimshuffle('x',0,'x','x')
after_C2 = theano.tensor.maximum(0.0, pre_C2)
# print "after_C2 shape: %s" % (after_C2.tag.test_value.shape,)
# 2x2 max pooling; incomplete border windows are dropped.
after_P2 = theano.tensor.signal.downsample.max_pool_2d(after_C2, (2,2), ignore_border=True)
# print "after_P2 shape: %s" % (after_P2.tag.test_value.shape,)

# Fully connected layers - all filter maps are simply flattened.
# The input size num_filters_2 * 5 * 5 hard-codes 5x5 maps after the second
# pooling step (32x32 inputs: 32 -> 28 -> 14 -> 10 -> 5).
num_fw3_hidden=500
FW3 = theano.shared(
    value=np.zeros((num_filters_2 * 5 * 5, num_fw3_hidden), dtype='float32'),
    name='FW3')
FW3.tag.initializer = IsotropicGaussian(0.05)

FB3 = theano.shared(
    value=np.zeros((num_fw3_hidden,), dtype='float32'),
    name='FB3')
FB3.tag.initializer = Constant(0.0)
model_parameters += [FW3, FB3]

# flatten(2) keeps the batch axis and merges all remaining axes into one,
# then ReLU(features . FW3 + bias) is applied.
pre_F3 = theano.tensor.dot(after_P2.flatten(2), FW3) + FB3.dimshuffle('x',0)
after_F3 = theano.tensor.maximum(0.0, pre_F3)
# print "after_F3 shape: %s" % (after_F3.tag.test_value.shape,)


# Output layer: a linear map from the 500 hidden units to 10 scores, one per
# class; the softmax is applied to these scores afterwards.
num_fw4_hidden=10
FW4 = theano.shared(
    value=np.zeros((num_fw3_hidden, num_fw4_hidden), dtype='float32'),
    name='FW4')
FW4.tag.initializer = IsotropicGaussian(0.05)

FB4 = theano.shared(
    value=np.zeros((num_fw4_hidden,), dtype='float32'),
    name='FB4')
FB4.tag.initializer = Constant(0.0)
model_parameters += [FW4, FB4]

# No nonlinearity here: after_F4 holds the raw class scores.
class_scores = theano.tensor.dot(after_F3, FW4)
after_F4 = class_scores + FB4.dimshuffle('x',0)
# print "after_F4 shape: %s" % (after_F4.tag.test_value.shape,)

# NOTE: despite its name, log_probs holds the softmax class *probabilities*
# (one row per example); the logarithm is only taken when nll is computed.
log_probs = theano.tensor.nnet.softmax(after_F4)

# The predicted class is the index of the most probable class.
predictions = theano.tensor.argmax(log_probs, axis=1)

# Y is a (batch_size x 1) matrix, so it is flattened before being compared
# with the predictions or used as a column index.
flat_targets = Y.ravel()
example_rows = theano.tensor.arange(Y.shape[0])

# Fraction of misclassified examples in the batch.
error_rate = theano.tensor.neq(predictions, flat_targets).mean()
# Mean negative log-probability assigned to the correct class.
nll = - theano.tensor.log(log_probs[example_rows, flat_targets]).mean()

# L2 penalty with coefficient 1e-3 on the weights only: parameters whose name
# has 'W' as second character (CW*, FW*); the biases are not penalised.
weight_decay = sum(
    (1e-3 * (p**2).sum() for p in model_parameters if p.name[1]=='W'),
    0.0)

cost = nll + weight_decay

# The graph is complete; make sure test-value computation is disabled
# from here on.
theano.config.compute_test_value = 'off'

In [21]:
# Symbolic hyper-parameters; their values are passed at every training step.
lrate = theano.tensor.scalar('lrate',dtype='float32')
momentum = theano.tensor.scalar('momentum',dtype='float32')

# Theano derives the gradient of the cost w.r.t. every parameter for us.
gradients = theano.grad(cost, model_parameters)

# One velocity buffer per parameter, with the parameter's shape and dtype,
# initialised to zero; it is the storage for the momentum term.
velocities = []
for param in model_parameters:
    velocities.append(
        theano.shared(np.zeros_like(param.get_value()), name='V_%s' %(param.name, )))

# The updates are (shared variable, new value) pairs:
#   velocity <- momentum * velocity - lrate * gradient
#   param    <- param + new velocity
updates = []
for param, grad, velocity in zip(model_parameters, gradients, velocities):
    velocity_next = momentum * velocity - lrate * grad
    updates.append((velocity, velocity_next))
    updates.append((param, param + velocity_next))


In [22]:
# Compile the Theano functions.

# train_step(X, Y, lrate, momentum) makes one SGD step by applying `updates`
# and returns [cost, error_rate, nll, weight_decay] for the given batch.
train_step = theano.function(
    inputs=[X, Y, lrate, momentum],
    outputs=[cost, error_rate, nll, weight_decay],
    updates=updates)
# predict(X) returns the predicted class index for every example of a batch.
predict = theano.function(inputs=[X], outputs=predictions)

In [23]:
def compute_error_rate(stream):
    """Return the fraction of misclassified examples over one epoch of `stream`.

    Each batch of the epoch is classified with the compiled `predict`
    function and compared against its flattened labels. The result is the
    total number of mismatches divided by the total number of examples.
    """
    wrong_total = 0.0
    seen_total = 0.0
    for images, labels in stream.get_epoch_iterator():
        mismatches = predict(images) != labels.ravel()
        wrong_total += mismatches.sum()
        seen_total += labels.shape[0]
    return wrong_total / seen_total

def init_parameters():
    """(Re)initialise every model parameter from its `tag.initializer`.

    A RandomState seeded with 1234 is created on each call and shared by all
    parameters, which are processed in `model_parameters` order.
    """
    rng = numpy.random.RandomState(1234)
    for param in model_parameters:
        target_shape = param.get_value().shape
        fresh_value = param.tag.initializer.generate(rng, target_shape)
        param.set_value(fresh_value)

def snapshot_parameters():
    """Return the current value of every model parameter as a list.

    Values are requested with borrow=False and are listed in
    `model_parameters` order.
    """
    snapshot = []
    for param in model_parameters:
        snapshot.append(param.get_value(borrow=False))
    return snapshot

def load_parameters(snapshot):
    """Write the values of `snapshot` back into the model parameters.

    `snapshot` is paired with `model_parameters` position by position (as
    produced by snapshot_parameters); pairing stops at the shorter of the two.
    Values are set with borrow=False.
    """
    for param, saved_value in zip(model_parameters, snapshot):
        param.set_value(saved_value, borrow=False)

In [None]:
i=0  # number of minibatches processed so far (across all epochs)
e=0  # number of epochs started so far

# Start from freshly initialised parameters and zeroed momentum buffers, so
# re-running this cell restarts training from scratch.
init_parameters()
for v in velocities:
    v.set_value(np.zeros_like(v.get_value()))

# Best validation result seen so far, and the parameters that produced it.
best_valid_error_rate = np.inf
best_params = snapshot_parameters()
best_params_epoch = 0

# Per-minibatch training history as (minibatch index, value) pairs, and
# per-epoch validation history as (minibatch index, error rate) pairs.
train_erros = []
train_loss = []
train_nll = []
validation_errors = []

# Train for at least number_of_epochs epochs; every improvement of the
# validation error extends the budget to e * patience_expansion + 1 epochs.
number_of_epochs = 3
patience_expansion = 1.5

# Learning rate schedule: constant 4e-3 for the first K minibatches, then
# decaying proportionally to K / i.
K = 2000
# NOTE: the numeric hyper-parameter values and the per-batch nll use their own
# names on purpose. Binding them to `lrate`, `momentum` or `nll` would
# overwrite the symbolic Theano variables of the same names, and re-running
# the cell that compiles train_step would then fail.
momentum_value = 0.9

# training loop
while e<number_of_epochs: #This loop goes over epochs
    e += 1
    #First train on all data from this epoch
    epoch_start_i = i
    for X_batch, Y_batch in Stream_Train.get_epoch_iterator():
        i += 1

        lrate_value = 4e-3 * K / np.maximum(K, i)

        L, err_rate, batch_nll, wdec = train_step(X_batch, Y_batch, lrate_value, momentum_value)

        train_loss.append((i,L))
        train_erros.append((i,err_rate))
        train_nll.append((i,batch_nll))
        if i % 100 == 0:
            print("At minibatch %d, batch loss %f, batch nll %f, batch error rate %f%%" % (i, L, batch_nll, err_rate*100))

    # After an epoch compute validation error
    val_error_rate = compute_error_rate(Stream_Validation)
    if val_error_rate < best_valid_error_rate:
        # New best model: extend the training budget and remember the parameters.
        number_of_epochs = np.maximum(number_of_epochs, e * patience_expansion+1)
        best_valid_error_rate = val_error_rate
        best_params = snapshot_parameters()
        best_params_epoch = e
    validation_errors.append((i,val_error_rate))
    print("After epoch %d: valid_err_rate: %f%% currently going to do %d epochs" %(
        e, val_error_rate*100, number_of_epochs))
    # The histories hold one entry per minibatch, so entries from index
    # epoch_start_i onwards belong to the epoch that just finished.
    print("After epoch %d: averaged train_err_rate: %f%% averaged train nll: %f averaged train loss: %f" %(
        e, np.mean(np.asarray(train_erros)[epoch_start_i:,1])*100,
        np.mean(np.asarray(train_nll)[epoch_start_i:,1]),
        np.mean(np.asarray(train_loss)[epoch_start_i:,1])))


At minibatch 100, batch loss 2.703885, batch nll 1.900320, batch error rate 75.000000%
At minibatch 200, batch loss 2.692368, batch nll 1.899582, batch error rate 80.000000%
At minibatch 300, batch loss 2.430619, batch nll 1.647972, batch error rate 45.000000%
At minibatch 400, batch loss 2.370766, batch nll 1.598010, batch error rate 60.000000%
At minibatch 500, batch loss 2.491653, batch nll 1.728923, batch error rate 45.000000%
At minibatch 600, batch loss 2.298749, batch nll 1.545995, batch error rate 50.000000%
At minibatch 700, batch loss 1.807559, batch nll 1.064499, batch error rate 35.000000%
At minibatch 800, batch loss 2.504328, batch nll 1.771201, batch error rate 50.000000%
At minibatch 900, batch loss 2.228119, batch nll 1.504578, batch error rate 65.000000%
At minibatch 1000, batch loss 2.714745, batch nll 2.000682, batch error rate 65.000000%
At minibatch 1100, batch loss 2.408517, batch nll 1.703583, batch error rate 50.000000%
At minibatch 1200, batch loss 1.939713, b