In [1]:
%matplotlib notebook
from __future__ import division



In [2]:
from matplotlib.pyplot import plot, ylabel, xlabel, yscale, xscale, legend, subplots
from theano import function
import numpy as np
import gzip
import cPickle
from scipy.optimize import minimize
from climin.util import optimizer
from itertools import repeat, cycle, islice, izip
inf = float("inf")

gnumpy: failed to import cudamat. Using npmat instead. No GPU will be used.


In [3]:
from breze.learn.data import one_hot
from breze.learn.base import cast_array_to_local_type
from schlichtanders.myfunctools import compose
from schlichtanders.myoptimizers import batch
from schlichtanders.mygenerators import eatN, chunk, chunk_gen, every
from schlichtanders.myplot import add_val, add_point

In [4]:
from theano_models import (as_tensor_variable, total_size, clone, clone_all,
                           Merge, FlatKey, Reparameterize, squareplus, squareplus_inv,
                           InvertibleModel,
                           inputting_references, outputting_references)
from theano_models.visualization import d3viz
from IPython.display import IFrame
import theano_models.deterministic_models as dm
import theano_models.probabilistic_models as pm
import theano_models.postmaps as post
from theano_models.composing import normalizing_flow, variational_bayes

In [5]:
import theano
from theano.printing import debugprint

In [6]:
# Add the custom key 'to_be_randomized' to the inputting_references collection
# imported from theano_models. The key is used further down as the flat_key for
# the predictor parameters and as the key handed to variational_bayes.
inputting_references.update(['to_be_randomized'])
# Display the collection after the update.
inputting_references

{'flat',
 'inputs',
 'n_data',
 'noise',
 'parameters',
 'parameters_positive',
 'parameters_pvalues',
 'to_be_randomized'}

In [7]:
# Display the outputting reference keys imported from theano_models (unchanged here).
outputting_references

{'kl_prior', 'logP', 'loglikelihood', 'norm_det', 'outputs'}

# Data

In [8]:
# Location of the gzipped, pickled MNIST archive (relative to this notebook).
datafile = '../data/mnist.pkl.gz'

# NOTE: unpickling can execute arbitrary code -- only load archives from a trusted source.
with gzip.open(datafile, 'rb') as mnist_file:
    train_set, val_set, test_set = cPickle.load(mnist_file)

# Every split is a pair: (inputs, labels).
(X, Z), (VX, VZ), (TX, TZ) = train_set, val_set, test_set

# One-hot encode the labels of all three splits over 10 classes.
Z, VZ, TZ = [one_hot(labels, 10) for labels in (Z, VZ, TZ)]

image_dims = 28, 28

# Cast every array with breze's cast_array_to_local_type.
X, Z, VX, VZ, TX, TZ = map(cast_array_to_local_type, (X, Z, VX, VZ, TX, TZ))

# Display the shapes of all arrays as a sanity check.
[np.shape(arr) for arr in (X, Z, VX, VZ, TX, TZ)]

[(50000, 784),
 (50000, 10),
 (10000, 784),
 (10000, 10),
 (10000, 784),
 (10000, 10)]

# Model

## data modelling

In [55]:
# Layer specification of the classifier: one rectifier hidden layer with 300 units.
hidden_sizes = [300] * 1
hidden_transfers = ["rectifier"] * 1

# MLP with a 10-way softmax output.
predictor = dm.Mlp(
    output_size=10,
    output_transfer="softmax",
    hidden_sizes=hidden_sizes,
    hidden_transfers=hidden_transfers,
)
# post.flatten_parameters(predictor)
predictor

Mlp5 { 'inputs': [AffineNonlinear9.inputs.0],
  'outputs': AffineNonlinear10.outputs,
  'parameters': [weights9, bias9, weights10, bias10]}

In [56]:
# Wrap the MLP in a categorical distribution. Per the repr displayed below, the
# MLP's (softmax) output ends up as the distribution's 'parameters_pvalues'.
target_distribution = pm.Categorical(predictor)
target_distribution

Categorical5 { 'inputs': [],
  'logP': <FunctionWrapper at 0x7f3924a16f30 for function at 0x7f39247c9410>,
  'outputs': Categorical5.outputs,
  'parameters_pvalues': [AffineNonlinear10.outputs]}

In [57]:
# Re-display of the object created in the previous cell; nothing new is computed here.
target_distribution

Categorical5 { 'inputs': [],
  'logP': <FunctionWrapper at 0x7f3924a16f30 for function at 0x7f39247c9410>,
  'outputs': Categorical5.outputs,
  'parameters_pvalues': [AffineNonlinear10.outputs]}

In [58]:
# Merge the categorical distribution with the MLP into one model and additionally
# expose the MLP's parameters through FlatKey under the key 'to_be_randomized'.
# Per the repr displayed below, 'parameters' is empty afterwards and
# 'to_be_randomized' refers to copies of the weights and biases.
targets = Merge(target_distribution, predictor, FlatKey(predictor, flat_key="to_be_randomized"))
targets

Merge13 { 'inputs': [AffineNonlinear9.inputs.0],
  'logP': <FunctionWrapper at 0x7f3924a16f30 for function at 0x7f39247c9410>,
  'outputs': Categorical5.outputs,
  'parameters': [],
  'parameters_pvalues': [],
  'to_be_randomized': "weights9_copy:bias9_copy:weights10_copy:bias10_copy"}

## parameter modelling

In [59]:
# Diagonal Gaussian whose output size equals the total size of
# targets['to_be_randomized']; it is the starting distribution that the
# normalizing-flow transforms are applied to further down.
params_base = pm.DiagGauss(output_size=total_size(targets['to_be_randomized']))  # if you want to use size directly, CAUTION, you need to copy before!
# params_base.map('parameters_positive', reparameterize_map(squareplus, squareplus_inv), 'parameters')
params_base

DiagGauss5 { 'inputs': [],
  'logP': <FunctionWrapper at 0x7f3924793590 for function at 0x7f39247bf8c0>,
  'noise': [DiagGaussianNoise5.noise.0],
  'outputs': DiagGaussianNoise5.outputs,
  'parameters': [mean9],
  'parameters_positive': [var9]}

In [60]:
# Two separate PlanarTransform instances; they are chained onto params_base
# by the normalizing_flow loop in the next cell.
normflows = [dm.PlanarTransform() for _ in range(2)]
normflows

[PlanarTransform9 { 'inputs': [z9],
   'norm_det': PlanarTransform9.norm_det,
   'outputs': PlanarTransform9.outputs,
   'parameters': [b9, w9, _u9]}, PlanarTransform10 { 'inputs': [z10],
   'norm_det': PlanarTransform10.norm_det,
   'outputs': PlanarTransform10.outputs,
   'parameters': [b10, w10, _u10]}]

In [61]:
# Chain the planar transforms on top of the base Gaussian, one after the other.
# Each iteration feeds the previous result back in, so the order of `normflows`
# determines the order of the flow steps.
params = params_base
for transform in normflows:
    params = normalizing_flow(transform, params)  # returns transform, however with adapted logP

# Display the final flow; per the repr below it carries the parameters of both
# transforms plus those of the base Gaussian.
params

normalized_flow10 { 'inputs': [],
  'logP': <FunctionWrapper at 0x7f39247934b0 for function at 0x7f392474bde8>,
  'noise': [DiagGaussianNoise5.noise.0],
  'norm_det': PlanarTransform10.norm_det,
  'outputs': PlanarTransform10.outputs,
  'parameters': [b10, w10, _u10, b9, w9, _u9, mean9],
  'parameters_positive': [var9]}

## bayes

In [62]:
# Gaussian prior over the same number of values as targets['to_be_randomized'].
prior = pm.Gauss(total_size(targets['to_be_randomized']))
# Drop the 'parameters' entry; per the repr below only 'parameters_positive'
# (the variance) remains on the prior.
del prior['parameters']  # mean is not adapted at all, but left centred at zero
prior

Gauss5 { 'inputs': [],
  'logP': <FunctionWrapper at 0x7f3924793600 for function at 0x7f39246f8848>,
  'noise': [GaussianNoise5.noise.0],
  'outputs': GaussianNoise5.outputs,
  'parameters_positive': [var10]}

In [63]:
# Combine likelihood model (`targets`), the distribution over its
# 'to_be_randomized' entry (`params`) and the `prior` via variational_bayes.
# Per the repr displayed below, the result exposes 'kl_prior', 'loglikelihood',
# 'n_data' and the noise/parameter entries of both `params` and `prior`.
# NOTE(review): the exact form of the resulting 'logP' is defined inside
# theano_models.composing -- confirm there before relying on its sign/scaling.
model = variational_bayes(targets, 'to_be_randomized', params, priors=prior, merge_priors=True)
model

variational_lower_bound5 { 'inputs': [AffineNonlinear9.inputs.0],
  'kl_prior': kl_prior,
  'logP': <FunctionWrapper at 0x7f39247937c0 for function at 0x7f392471a848>,
  'loglikelihood': <FunctionWrapper at 0x7f3924a16f30 for function at 0x7f39247c9410>,
  'n_data': n_data,
  'noise': [DiagGaussianNoise5.noise.0, GaussianNoise5.noise.0],
  'norm_det': PlanarTransform10.norm_det,
  'outputs': Categorical5.outputs,
  'parameters': [b10, w10, _u10, b9, w9, _u9, mean9],
  'parameters_positive': [var9, var10],
  'parameters_pvalues': []}

In [64]:
# Reparameterize all entries of 'parameters_positive' with squareplus /
# squareplus_inv. Per the repr displayed below, 'parameters_positive' is empty
# afterwards and the '*_squareplus' variables appear under 'parameters'.
model = Merge(model, Reparameterize(model['parameters_positive'], squareplus, squareplus_inv))
model

Merge14 { 'inputs': [AffineNonlinear9.inputs.0],
  'kl_prior': kl_prior,
  'logP': <FunctionWrapper at 0x7f39247937c0 for function at 0x7f392471a848>,
  'loglikelihood': <FunctionWrapper at 0x7f3924a16f30 for function at 0x7f39247c9410>,
  'n_data': n_data,
  'noise': [DiagGaussianNoise5.noise.0, GaussianNoise5.noise.0],
  'norm_det': PlanarTransform10.norm_det,
  'outputs': Categorical5.outputs,
  'parameters': [ b10,
                  w10,
                  _u10,
                  b9,
                  w9,
                  _u9,
                  mean9,
                  var9_squareplus,
                  var10_squareplus],
  'parameters_positive': [],
  'parameters_pvalues': []}

In [65]:
# Expose flat versions of the model's entries:
#   - FlatKey(model) with default arguments; per the repr below this yields the
#     'flat' entry built from copies of all 'parameters' (which become empty),
#   - the 'noise' entries are exposed the same way under 'noise_flat'.
model = Merge(model,
              FlatKey(model),
              FlatKey(model, key="noise", flat_key="noise_flat"))
model

Merge15 { 'flat': "b10_copy:w10_copy:_u10_copy:b9_copy:w9_copy:_u9_copy:mean9_copy:var9_squareplus_copy:var10_squareplus_copy",
  'inputs': [AffineNonlinear9.inputs.0],
  'kl_prior': kl_prior,
  'logP': <FunctionWrapper at 0x7f39247937c0 for function at 0x7f392471a848>,
  'loglikelihood': <FunctionWrapper at 0x7f3924a16f30 for function at 0x7f39247c9410>,
  'n_data': n_data,
  'noise': [],
  'noise_flat': "DiagGaussianNoise5.noise.0_copy:GaussianNoise5.noise.0_copy",
  'norm_det': PlanarTransform10.norm_det,
  'outputs': Categorical5.outputs,
  'parameters': [],
  'parameters_positive': [],
  'parameters_pvalues': []}

# Optimizer

In [66]:
# Class-level call on InvertibleModel, executed once before the numeric functions are built.
# NOTE(review): presumably this simplifies f(f_inv(x)) pairs introduced by the
# reparameterizations above -- confirm against the theano_models implementation.
InvertibleModel.reduce_all_identities()

In [67]:
def _jitter_initial_parameters(ps):
    """Return `ps` plus Gaussian noise (standard deviation 0.01), one draw per entry."""
    return ps + np.random.normal(size=ps.size) * 0.01


# Composition of the two postmaps from theano_models.postmaps (argument order as given to `compose`).
postmap = compose(post.flat_numericalize_postmap, post.variational_postmap)

# Keyword arguments forwarded to the composed postmap.
postmap_kwargs = dict(
    wrapper=batch,
    initial_inputs=[X[0]],
    adapt_init_params=_jitter_initial_parameters,
    profile=True,
    mode='FAST_RUN',
)

# Numeric loss / gradient / initial parameters for the model ...
optimizer_kwargs = postmap(model, **postmap_kwargs)
# ... converted to the keyword arguments climin expects ('f', 'fprime', 'wrt').
climin_kwargs = post.climin_postmap(optimizer_kwargs)
climin_kwargs

{'f': <function schlichtanders.myoptimizers.f_batch>,
 'fprime': <function schlichtanders.myoptimizers.f_batch>,
 'wrt': array([-0.00108581,  1.01182628,  0.99703656, ...,  0.99346847,
         1.00343572,  0.99808337])}

Climin wants an iterator of (args, kwargs) pairs as the keyword argument "args" (to be passed to the loss function). Concretely, we use an infinite iterator over minibatches.

TODO: write `chunk` with O(M) cost, where M = number of minibatches.
The current implementation is O(N), where N = number of samples, because it uses generators instead of list slicing.

try to use ``chunk_gen``

In [None]:
batch_size = 200

# Endless streams of minibatches: labels first, inputs second.
label_batches = chunk(batch_size, cycle(Z))
input_batches = chunk(batch_size, cycle(X))

# climin consumes (args, kwargs) pairs; args = (labels, inputs), kwargs is always empty.
climin_args = izip(izip(label_batches, input_batches), repeat({}))

In [None]:
# Build the climin optimizer (adadelta) from the minibatch iterator and the
# 'f' / 'fprime' / 'wrt' entries prepared in climin_kwargs.
opt = optimizer(
    identifier = "adadelta",
    args=climin_args, # alternative (full batch): repeat(((Z,X),{})),
    **climin_kwargs
)

# Visualizing model

In [None]:
# Render the model graph to 'tmp/model.html' and embed it in the notebook.
# `f` is a plain (inputs, outputs) tuple rather than a compiled theano function
# (the compiled variant is kept commented out below).
# f = theano.function([model['flat']] + model['inputs'], model['outputs'], mode="FAST_COMPILE")
f = [model['flat']] + model['inputs'], model['outputs']
d3viz(f, 'tmp/model.html', match_by_names=True) #, [targets, target_distribution, predictor, params] + predictor.layers + normflows + [params_base] + Helper.all_helpers[::-1])
IFrame('tmp/model.html', width=700, height=500)

# Visualizing Loss

In [None]:
# Display the 'loss' entry produced by the postmap.
optimizer_kwargs['loss']

In [None]:
# Evaluate the numeric loss once on a single sample. The value is not displayed
# because this is not the last expression of the cell.
# NOTE(review): presumably needed so that `.wrapped` below refers to an already
# compiled function -- confirm.
optimizer_kwargs['num_loss'](optimizer_kwargs['num_parameters'], Z[0:1], X[0:1])

# Render the graph of the wrapped loss function to 'tmp/loss.html' and embed it.
f = optimizer_kwargs['num_loss'].wrapped
d3viz(f, 'tmp/loss.html', match_by_names=True)
IFrame('tmp/loss.html', width=700, height=500)

# Visualizing gradient

In [None]:
# Evaluate the numeric gradient once on the first 100 samples. The value is not
# displayed because this is not the last expression of the cell.
# NOTE(review): presumably needed so that `.wrapped` below refers to an already
# compiled function -- confirm.
optimizer_kwargs['num_jacobian'](optimizer_kwargs['num_parameters'], Z[0:100], X[0:100])

# Render the graph of the wrapped gradient function to 'tmp/gradient.html' and embed it.
f = optimizer_kwargs['num_jacobian'].wrapped
d3viz(f, 'tmp/gradient.html', match_by_names=True)
IFrame('tmp/gradient.html', width=700, height=500)

# Visualized Fit

In [None]:
# Create three empty line artists; the training loop appends points to them.
line_train, line_curr_val, line_best_val = [
    plot([], [], line_format, label=line_label)[0]
    for line_format, line_label in (
        ('go-', "average training loss"),
        ('bo:', "avrg current validation loss"),
        ('ko-', "avrg best validation loss"),
    )
]
# plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))

# Axis setup: logarithmic loss axis, iteration count on the x axis.
yscale('log')
ylabel("validation loss")
xlabel("#iteration")
legend(loc='lower left', fancybox=True, framealpha=0.5)

In [None]:
# Model selection state: lowest validation loss seen so far and the parameters
# that produced it.
best_val_loss = inf
best_wrt = None
val_size = batch_size

# Number of minibatches in one pass over the training data.
n_whole_data = X.shape[0] // batch_size

# `opt` is fed by an endless minibatch iterator, so this loop has no stopping
# criterion of its own: interrupt the kernel to end training.
# NOTE(review): `every(n, it)` presumably yields every n-th item of `it`, i.e.
# one `info` per pass over the training data -- confirm in mygenerators.
for info in every(n_whole_data, opt):
    # collect and visualize validation loss for choosing the best model
    val_loss = optimizer_kwargs['num_loss'](opt.wrt, VZ[:val_size], VX[:val_size])/val_size
    if val_loss < best_val_loss:
        # climin updates `opt.wrt` in place, so a bare reference would silently
        # follow all later updates; take a snapshot of the best parameters.
        best_wrt = opt.wrt.copy()
        best_val_loss = val_loss
        add_point(line_best_val, info['n_iter'], val_loss)
    add_point(line_curr_val, info['n_iter'], val_loss)

    # visualize training loss for comparison:
    try:
        training_loss = info['loss'] / len(Z)  # TODO normalization needed?
    except KeyError:
        # The optimizer did not report a loss: evaluate it on the full training set.
        training_loss = optimizer_kwargs['num_loss'](opt.wrt, Z, X)/len(Z)
    add_point(line_train, info['n_iter'], training_loss)

# Performance

TODO: average over predictions

In [None]:
# Report the best validation loss and build a prediction function from the
# best parameters found during training.
print best_val_loss
# NOTE(review): `mlp` is not defined in any cell of this notebook (the network
# above is bound to `predictor` / `model`, and the flat parameter entry created
# above is called 'flat', not 'parameters_flat'). On a fresh kernel this line
# raises NameError -- this cell looks like a leftover from an earlier version
# and needs to be ported to the current model objects.
mlp['parameters_flat'] = best_wrt

predict = mlp.function()
# Compare the prediction for the first training sample with its one-hot label.
predict(X[0]), Z[0]

In [None]:
# Apply `predict` row by row to the training, validation and test inputs.
PX, PVX, PTX = [
    np.apply_along_axis(predict, 1, inputs)
    for inputs in (X, VX, TX)
]

In [None]:
# Boolean masks of misclassified samples: the arg-max over the first 10
# prediction columns is compared with the arg-max of the one-hot labels.
wrong_train = PX[:, :10].argmax(1) != Z.argmax(1)
wrong_val = PVX[:, :10].argmax(1) != VZ.argmax(1)
wrong_test = PTX[:, :10].argmax(1) != TZ.argmax(1)

# Absolute number of misclassified samples per split.
print('incorrect samples train/val/test:  %i/%i/%i' % (
    wrong_train.sum(), wrong_val.sum(), wrong_test.sum()))

# Fraction of misclassified samples per split.
print('error rate train/val/test:  %g/%g/%g' % (
    wrong_train.mean(), wrong_val.mean(), wrong_test.mean()))