In [None]:
%matplotlib notebook
from __future__ import division

In [None]:
from matplotlib.pyplot import plot, ylabel, xlabel, yscale, xscale, legend, subplots
from theano import function
import numpy as np
import gzip
import cPickle
from scipy.optimize import minimize
from climin.util import optimizer
from itertools import repeat, cycle, islice, izip
# Positive infinity; used further down as the initial "best validation loss" sentinel.
inf = float("inf")

In [None]:
from breze.learn.data import one_hot
from breze.learn.base import cast_array_to_local_type
from schlichtanders.myfunctools import compose
from schlichtanders.myoptimizers import batch
from schlichtanders.mygenerators import eatN, chunk, every
from schlichtanders.myplot import add_val, add_point

In [None]:
from theano_models import (as_tensor_variable, total_size, clone, clone_all,
                           Merge, FlatKey, Reparameterize, squareplus, squareplus_inv,
                           InvertibleModel,
                           inputting_references, outputting_references)
from theano_models.visualization import d3viz
from IPython.display import IFrame
import theano_models.deterministic_models as dm
import theano_models.probabilistic_models as pm
import theano_models.postmaps as post
from theano_models.composing import normalizing_flow, variational_bayes

In [None]:
from theano_models import get_equiv_by_name 

In [None]:
from theano.printing import debugprint

In [None]:
# Register the key 'to_be_randomized' as an additional inputting reference, then display the registry.
# NOTE(review): presumably theano_models then treats this key like an input when models are merged -- confirm against the library.
inputting_references.update(['to_be_randomized'])
inputting_references

In [None]:
# Display the outputting reference keys currently registered in theano_models.
outputting_references

# test

In [None]:
import theano.tensor as T
from theano.gof.graph import variables

In [None]:
# Scratch graph for experimenting: two symbolic theano vectors named "a" and "b"
# and a named linear combination of them.
a, b = T.dvectors("ab")
c = 2*a + b
c.name = "c"

In [None]:
# Display the graph variables found between the inputs [a, b] and the output [c].
variables([a,b], [c])

# Data

In [None]:
datafile = '../data/mnist.pkl.gz'

# Load the pickled (train, validation, test) splits.
# SECURITY: unpickling can execute arbitrary code -- only load a file you trust.
with gzip.open(datafile, 'rb') as f:
    train_set, val_set, test_set = cPickle.load(f)

# Each split is an (inputs, labels) pair.
X, Z = train_set
VX, VZ = val_set
TX, TZ = test_set

# One-hot encode the labels of every split with 10 classes.
Z, VZ, TZ = [one_hot(labels, 10) for labels in (Z, VZ, TZ)]

image_dims = 28, 28

X, Z, VX, VZ, TX, TZ = [cast_array_to_local_type(arr) for arr in (X, Z, VX, VZ, TX, TZ)]
# Show the shape of every array as a sanity check.
[np.shape(arr) for arr in (X, Z, VX, VZ, TX, TZ)]

# Model

## data modelling

In [None]:
# Deterministic classifier network: two hidden layers of 200 units with rectifier
# transfer, followed by a softmax output of size 10.
predictor = dm.Mlp(output_size=10, output_transfer="softmax", hidden_sizes=[200]*2, hidden_transfers=["rectifier"]*2)
# post.flatten_parameters(predictor)
predictor

In [None]:
# Build a categorical distribution on top of the predictor.
# NOTE(review): presumably the softmax outputs serve as class probabilities -- confirm in theano_models.
target_distribution = pm.Categorical(predictor)
target_distribution

In [None]:
# Display the categorical target distribution again (same object as built in the previous cell).
target_distribution

In [None]:
# Merge the distribution and the predictor into one model and add a FlatKey over the
# predictor stored under the key 'to_be_randomized' (the key used for the Bayesian treatment below).
# NOTE(review): the exact Merge / FlatKey semantics are defined in theano_models -- confirm there.
targets = Merge(target_distribution, predictor, FlatKey(predictor, flat_key="to_be_randomized"))
targets

## parameter modelling

In [None]:
# Base distribution over the randomized parameters: a DiagGauss whose output size is
# the total size of targets['to_be_randomized'].
params_base = pm.DiagGauss(output_size=total_size(targets['to_be_randomized']))  # if you want to use size directly, CAUTION, you need to copy before!
# params_base.map('parameters_positive', reparameterize_map(squareplus, squareplus_inv), 'parameters')
params_base

In [None]:
# Two independent PlanarTransform instances; they are chained as normalizing-flow steps in the next cell.
normflows = [dm.PlanarTransform() for _ in range(2)]
normflows

In [None]:
# Start from the base distribution and wrap it successively in each flow step;
# after the loop `params` is the result of the last normalizing_flow call.
params = params_base
for transform in normflows:
    params = normalizing_flow(transform, params)  # returns transform, however with adapted logP

params

## bayes

In [None]:
# Prior over the randomized parameters: a DiagGauss with one dimension per entry of
# targets['to_be_randomized'].
# The size is obtained via total_size(...) exactly as for params_base; the comment on
# params_base warns that using `.size` directly is only safe after copying the variable.
prior = pm.DiagGauss(output_size=total_size(targets['to_be_randomized']))
prior

In [None]:
# Combine the data model, the variational parameter distribution `params` and the prior
# into a variational-Bayes model over the key 'to_be_randomized'.
model = variational_bayes(targets, 'to_be_randomized', params, priors=prior)
model

In [None]:
# Reparameterize the model's 'parameters_positive' entries via squareplus / squareplus_inv.
# NOTE(review): presumably this lets the optimizer work on unconstrained values -- confirm in theano_models.
# This cell rebinds `model`, so re-running it applies the reparameterization to the already merged model.
model = Merge(model, Reparameterize(model['parameters_positive'], squareplus, squareplus_inv))
model

In [None]:
# Add a FlatKey over the whole model, using the first training example as initial input.
# The resulting model['flat'] entry is what the visualization cells below pass as first graph input.
model = Merge(model, FlatKey(model, initial_inputs=[X[0]]))
model

# Optimizer

In [None]:
# NOTE(review): presumably simplifies transform/inverse pairs registered on InvertibleModel
# before the graph is compiled -- confirm in theano_models.
InvertibleModel.reduce_all_identities()

In [None]:
def _jitter_init_params(ps):
    """Return `ps` plus Gaussian noise with standard deviation 0.01 (one draw per entry)."""
    return ps + np.random.normal(size=ps.size) * 0.01


# Chain the two postmaps into a single callable.
postmap = compose(post.flat_numericalize_postmap, post.variational_postmap)
postmap_kwargs = dict(
    wrapper=batch,
    initial_inputs=[X[0]],
    adapt_init_params=_jitter_init_params,
)
optimizer_kwargs = postmap(model, **postmap_kwargs)
# Translate into the keyword arguments handed to the climin optimizer below.
climin_kwargs = post.climin_postmap(optimizer_kwargs)
climin_kwargs

Climin wants an iterator of (args, kwargs) pairs as the keyword argument "args" (each pair is passed to the loss function). Concretely, we use an infinite iterator over minibatches.

In [None]:
batch_size = 20

# Endless minibatch streams: cycle through the data forever and cut it into chunks of batch_size.
target_batches = chunk(batch_size, cycle(Z))
input_batches = chunk(batch_size, cycle(X))

# Each item is ((targets, inputs), {}): positional args plus (empty) kwargs for the loss.
climin_args = izip(izip(target_batches, input_batches), repeat({}))

In [None]:
# Build the climin optimizer (adadelta) fed by the infinite minibatch iterator.
# The remaining settings come from climin_kwargs.
opt = optimizer(
    identifier = "adadelta",
    args=climin_args, # full-batch alternative: repeat(((Z,X),{})),
    **climin_kwargs
)

# Visualizing model

In [None]:
# Render the model graph with d3viz and embed the resulting HTML page.
# d3viz is given the (inputs, outputs) pair directly instead of a compiled function:
# f = theano.function([model['flat']] + model['inputs'], model['outputs'], mode="FAST_COMPILE")
# NOTE(review): assumes the directory 'tmp/' can be written to -- confirm.
f = [model['flat']] + model['inputs'], model['outputs']
d3viz(f, 'tmp/model.html', match_by_names=True) #, [targets, target_distribution, predictor, params] + predictor.layers + normflows + [params_base] + Helper.all_helpers[::-1])
IFrame('tmp/model.html', width=700, height=500)

# Visualizing Loss

In [None]:
# Render the loss graph (optimizer_kwargs['loss']) with d3viz and embed the resulting HTML page.
# As above, the (inputs, outputs) pair is passed instead of a compiled function:
# f = theano.function([model['flat']] + model['inputs'], optimizer_kwargs['loss'], mode="FAST_COMPILE", profile=True)
f = [model['flat']] + model['inputs'], optimizer_kwargs['loss']
d3viz(f, 'tmp/loss.html')
IFrame('tmp/loss.html', width=700, height=500)

# Visualized Fit

In [None]:
# Create three empty line artists; the training loop appends points to them as losses come in.
line_train, = plot([], [], 'go-', label="average training loss")
line_curr_val, = plot([], [], 'bo:', label="avrg current validation loss")
line_best_val, = plot([], [], 'ko-', label="avrg best validation loss")
# plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
yscale('log')
# The axis carries the training curve as well as both validation curves,
# so it is labelled generically instead of "validation loss".
ylabel("loss")
xlabel("#iteration")
legend(loc='lower left', fancybox=True, framealpha=0.5)

In [None]:
best_val_loss = inf
best_wrt = None
val_size = batch_size

# Number of minibatches in one pass over the training data.
n_whole_data = X.shape[0] // batch_size

# NOTE: the minibatch iterator is infinite and no stopping criterion is visible here,
# so this loop appears to run until the kernel is interrupted.
# NOTE(review): `every(n, it)` presumably yields every n-th optimizer info dict -- confirm.
for info in every(n_whole_data, opt):
    # collect and visualize validation loss for choosing the best model
    val_loss = optimizer_kwargs['num_loss'](opt.wrt, VZ[:val_size], VX[:val_size])/val_size
    if val_loss < best_val_loss:
        # climin optimizers update `wrt` in place, so keep a snapshot; a plain
        # reference would silently follow all later parameter updates.
        best_wrt = opt.wrt.copy()
        best_val_loss = val_loss
        add_point(line_best_val, info['n_iter'], val_loss)
    add_point(line_curr_val, info['n_iter'], val_loss)

    # visualize training loss for comparison:
    try:
        training_loss = info['loss'] / len(Z)  # TODO normalization needed?
    except KeyError:
        # the optimizer did not report a loss; evaluate it on the full training set instead
        training_loss = optimizer_kwargs['num_loss'](opt.wrt, Z, X)/len(Z)
    add_point(line_train, info['n_iter'], training_loss)

# Performance

TODO: average over predictions

In [None]:
# Report the best validation loss and build a prediction function from the best parameters.
print best_val_loss
# NOTE(review): `mlp` is not defined anywhere in the visible part of this notebook (the
# network is called `predictor`), so this cell raises a NameError on a fresh kernel.
# NOTE(review): `best_wrt` holds the flat parameters optimized for the variational `model`;
# presumably it does not match the plain network's parameter layout -- TODO confirm and fix.
mlp['parameters_flat'] = best_wrt

predict = mlp.function()
# Compare the prediction for the first training example with its one-hot target.
predict(X[0]), Z[0]

In [None]:
# Apply `predict` to every row (axis 1) of the training, validation and test inputs.
PX, PVX, PTX = [np.apply_along_axis(predict, 1, inputs) for inputs in (X, VX, TX)]

In [None]:
# Boolean mismatch masks: arg-max over the first 10 prediction columns vs. arg-max of the
# one-hot targets. Computed once per split and reused for counts and rates.
wrong_train = PX[:, :10].argmax(1) != Z.argmax(1)
wrong_val = PVX[:, :10].argmax(1) != VZ.argmax(1)
wrong_test = PTX[:, :10].argmax(1) != TZ.argmax(1)

print('incorrect samples train/val/test:  %i/%i/%i' % (
    wrong_train.sum(), wrong_val.sum(), wrong_test.sum()))

print('error rate train/val/test:  %g/%g/%g' % (
    wrong_train.mean(), wrong_val.mean(), wrong_test.mean()))