In [329]:
import os
import six.moves.cPickle as pickle
import gzip
import numpy as np
import torch as T
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
floatX = T.float32

In [330]:
# create shared variables for using gpu
def shared_dataset(data, borrow=True, data_types=['float32','int32']):
    if type(data) is not list:
        data = list(data)
    output = []
    for i, x in enumerate(data):
        output.append(T.as_tensor(np.asarray(x, dtype=data_types[i]), borrow=borrow))
    return output

def load_dataset(dataset):
    # get path/file for dataset
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the current directory.
        new_path = os.path.join(os.curdir, dataset)
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path
    # download from website
    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
        from six.moves import urllib
        origin = ('http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz')
        print('Downloading data from %s' % origin)
        urllib.request.urlretrieve(origin, dataset)
    # load from pickle
    print('... loading data')
    with gzip.open(dataset, 'rb') as f:
        try:
            train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
        except:
            train_set, valid_set, test_set = pickle.load(f)
    # set test/valid/train sets
    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)
    # combine datasets
    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), (test_set_x, test_set_y)]
    return rval

In [957]:
class HelmholtzLayer(object):
    '''
    Helmholtz layer for Helmholtz Machine
    
    Parameters
    ----------
    input: theano matrix, input to layer (data or output of previous layer)
    n_in: int, number of input units
    n_out: int, number of hidden units
    unit: str, hidden unit type ['binary' (default) or 'gaussian']
    top_layer: bool, True/False layer is top layer
    k: float, fraction of most active units to keep; less active units are set to 0
        [default: 0, no units set to 0]
    
    Returns
    -------
    HelmholtzLayer
    
    Note
    ----
    Gaussian units assume 0 mean, 1 std (fixed sigma).
    '''
    def __init__(self, input, n_in, n_out, unit='binary', top_layer=False, k=T.tensor(0, dtype=T.int32)):
        # init vars
        self.input = input
        self.n_in = n_in
        self.n_out = n_out
        self.unit = unit
        self.top_layer = top_layer
        self.k = k
        
        # recognition weights
        self.WR = 0.01 * T.randn((n_in,n_out), dtype=floatX)
        self.WR.requires_grad=True
        # recognition biases
        self.bR = T.zeros((n_out,), dtype=floatX, requires_grad=True)
        # generative weights
        self.WG = 0.01 * T.randn((n_out,n_in), dtype=floatX)
        self.WG.requires_grad=True
        # generative biases
        self.bG = T.zeros((n_in,), dtype=floatX, requires_grad=True)
        
        # momentum
        self.WR_inc = T.zeros((n_in,n_out), dtype=floatX)
        self.bR_inc = T.zeros((n_out,), dtype=floatX)
        self.WG_inc = T.zeros((n_out,n_in), dtype=floatX) 
        self.bG_inc = T.zeros((n_in,), dtype=floatX)
        
        # if top_layer, remove shared WR, bR, WG
        if self.top_layer:
            self.WR = T.zeros_like(self.WR)
            self.bR = T.zeros_like(self.bR) 
            self.WG = T.zeros_like(self.WG) 
            # set gen_params, rec_params
            self.gen_params = [self.bG]
            self.rec_params = []
            self.inc_params = [self.bG_inc]
        else:
            # set gen_params, rec_params
            self.gen_params = [self.WG, self.bG]
            self.rec_params = [self.WR, self.bR]
            self.inc_params = [self.WG_inc, self.bG_inc, self.WR_inc, self.bR_inc]
        
        # set output
        self.set_output()
        
        # init reconstr, top_down
        self.reconstr = None
        self.top_down = None
        
    def activation(self, u, unit):
        if unit == 'binary':
            y = T.Tensor.sigmoid(u)
        elif unit == 'gaussian':
            y = u
        else: # throw error
            raise NotImplementedError
        return y
    
    def sample(self, u, unit):
        if unit == 'binary':
            y = T.bernoulli(u)
        elif unit == 'gaussian':
            y = T.add(u, T.normal(u, T.ones_like(u)))
        else: # throw error
            raise NotImplementedError
        return y
    
    def prob(self, u, x, unit):
        if unit == 'binary':
            p = T.pow(u, x) * T.pow(1. - u, 1. - x)
        elif unit == 'gaussian':
            p = (1./T.sqrt(2. * np.pi)) * T.exp(-T.pow((x - u), 2.)/2.)
        else: # throw error
            raise NotImplementedError
        return p
    
    def propup(self, v):
        pre_act_h = T.mm(v, self.WR) + self.bR
        # if k-sparse, apply
        pre_act_h = self.k_sparse(pre_act_h, self.k)
        return self.activation(pre_act_h, self.unit)
    
    def propdown(self, h):
        pre_act_v = T.mm(h, self.WG) + self.bG
        # if k-sparse, apply
        pre_act_v = self.k_sparse(pre_act_v, self.k)
        return self.activation(pre_act_v, self.unit)
    
    def sample_h_given_v(self, v):
        h_mean = self.propup(v)
        return self.sample(h_mean, self.unit)
    
    def sample_v_given_h(self, h):
        v_mean = self.propdown(h)
        return self.sample(v_mean, self.unit)
    
    def get_wake_derivs(self):
        # get delta by propagating down with output
        delta = self.propdown(self.output)
        
        # get wake derivatives
        dWG = T.div(T.mm(self.output.transpose(0,1), (self.input - delta)),
                    T.as_tensor(self.input.shape[0], dtype=floatX))
        dbG = T.mean((self.input - delta), 0)
        
        # if top_layer, no WG derivs
        if self.top_layer:
            return [dbG]
        else:
            return [dWG, dbG]
        
    def get_sleep_derivs(self):
        # if top_layer, no sleep derivs
        if self.top_layer:
            return []
        
        # get psi by propagating up with reconstr
        psi = self.propup(self.reconstr)
        
        # get sleep derivatives
        dWR = T.div(T.mm(self.reconstr.transpose(0,1), (self.top_down - psi)),
                    T.as_tensor(self.reconstr.shape[0], dtype=floatX))
        dbR = T.mean((self.top_down - psi), 0)
        return [dWR, dbR]
    
    def switch_awake(self, awake):
        # set x,y based on wake or sleep
        if awake:
            x = self.input
            y = self.output
        else:
            x = self.reconstr
            y = self.top_down
        return x, y
    
    def log_prob(self, awake):
        # get x, y
        x, y = self.switch_awake(awake)
        # get activations
        q = self.activation(self.propup(x), self.unit)
        p = self.activation(self.propdown(y), self.unit)
        # compute log probs
        if self.unit == 'binary':
            log_q = T.sum(T.add(y * T.log(q + 1e-6), (1. - y) * T.log(1. - q + 1e-6)), 1)
            log_p = T.sum(T.add(x * T.log(p + 1e-6), (1. - x) * T.log(1. - p + 1e-6)), 1)
        elif self.unit == 'gaussian':
            log_q = T.sum((-T.log(2. * np.pi) / 2.) - (T.pow((y - q), 2.) / 2.), 1)
            log_p = T.sum((-T.log(2. * np.pi) / 2.) - (T.pow((x - p), 2.) / 2.), 1)
        return log_q, log_p
        
    def k_sparse(self, x, k):
        # get threshold and repeat across axis 1
        if T.gt(k, x.shape[1]):
            k = T.tensor(0, dtype=T.int32)
        # horrible workaround for torch's dumb indexing bug
        k_mask = T.zeros((1, x.shape[1]), dtype=T.uint8)
        k_mask[0,-k] = 1
        k_mask = k_mask.repeat(x.shape[0], 1)
        thr = T.sort(x)[0][k_mask].unsqueeze(1) # T.sort crashes for some reason
        thr = thr.repeat(1, x.shape[1])
        # set values >= thr to x, values <= thr to 0
        x[T.lt(T.abs(x), thr)] = 0.
        return x
    
    def set_output(self):
        self.output = self.sample_h_given_v(self.input)
    
    def set_reconstr(self, top_down):
        self.top_down = top_down
        self.reconstr = self.sample_v_given_h(self.top_down)

In [1101]:
class HelmholtzMachine(object):
    '''
    Helmholtz machine
    
    Parameters
    ----------
    n_ins: list of ints, number of inputs for each layer
    unit: str, unit type ['binary' (default) or 'gaussian']
    k: float, fraction of most active units to keep (less active units set to 0) [default: 0]
    batch_size: int, size of mini-batch
    
    Returns
    -------
    HelmholtzMachine
    '''
    def __init__(self, n_ins, unit='binary', k=T.tensor(0, dtype=T.int32), batch_size=1):
        # init vars
        self.n_layers = len(n_ins)
        self.n_ins = n_ins
        self.unit = unit
        self.k = k
        self.batch_size = batch_size
            
        # init first layer input variable
        self.v = T.zeros((batch_size, self.n_ins[0]), dtype=floatX)
        
        # for each layer, append HelmholtzLayer
        self.params = []
        self.helmholtz_layers = []
        for n in range(self.n_layers):
            # set bG_1 to True if top layer, False otherwise
            is_top_layer = (n == self.n_layers - 1)
            # set input_layer
            if n == 0:
                input_layer = self.v
            else:
                input_layer = self.helmholtz_layers[-1].output
            # set n_out
            if is_top_layer:
                n_out = 1
            else:
                n_out = self.n_ins[n+1]
            # create helmholtz layer
            self.helmholtz_layers.append(HelmholtzLayer(input_layer, 
                                                         self.n_ins[n], 
                                                         n_out,
                                                         unit=self.unit,
                                                         top_layer=is_top_layer,
                                                         k=self.k))
            # extend params
            self.params.extend(self.helmholtz_layers[n].gen_params + self.helmholtz_layers[n].rec_params)
            
        # for each layer, set reconstr
        for n in range(self.n_layers-1, -1, -1):
            # for top layer, top_down is zeros
            if n == self.n_layers-1:
                top_down = T.zeros((1,1), dtype=floatX)
            else:
                top_down = self.helmholtz_layers[n+1].reconstr
            # set top_down and reconstr
            self.helmholtz_layers[n].set_reconstr(top_down)
    
    def forward(self, v=None):
        if v is not None:
            self.v = v
        for n in range(self.n_layers):
            if n == 0:
                self.helmholtz_layers[n].input = self.v
            else:
                self.helmholtz_layers[n].input = self.helmholtz_layers[n-1].output
            self.helmholtz_layers[n].set_output()
    
    def downward(self):
        # for each layer, set reconstr
        for n in range(self.n_layers-1, -1, -1):
            # for top layer, top_down is zeros
            if n == self.n_layers-1:
                top_down = T.zeros((1,1), dtype=floatX)
            else:
                top_down = self.helmholtz_layers[n+1].reconstr
            # set top_down and reconstr
            self.helmholtz_layers[n].set_reconstr(top_down)
    
    def model_sample(self):
        self.downward()
        return self.helmholtz_layers[0].reconstr[0]
    
    def model_prob(self):
        self.downward()
        layer0 = self.helmholtz_layers[0]
        return layer0.activation(layer0.propdown(layer0.top_down), layer0.unit)
    
    def free_energy(self, D, awake=T.tensor(1., dtype=T.int32)):
        # init FE
        FE = 0.
            
        # compute FE for each layer (log_q of the data is 0)
        FE = FE - self.helmholtz_layers[0].log_prob(awake)[1]
        for n in range(1, self.n_layers):
            log_q = self.helmholtz_layers[n-1].log_prob(awake)[0]
            log_p = self.helmholtz_layers[n].log_prob(awake)[1]
            FE = FE + (log_q - log_p)
        
        return T.sum(FE)
        
    def importance_weighting(self, log_q, log_p):
        # from Bornschein et al., 2016
        # w = sqrt(p/q)
        log_w = (log_p - log_q) / 2.
        # w_sum = sum_k(log_pq)
        log_w_max = T.max(log_w, 1, keepdims=True)
        log_w_sum = T.log(T.sum(T.exp(log_w - log_w_max), 1, keepdims=True)) + log_w_max
        # w_norm = w/w_sum
        log_w_norm = log_w - log_w_sum
        # w = exp(log_w_norm)
        return T.exp(log_w_norm)
    
    def log_likelihood(self, D, awake=T.tensor(1., dtype=T.int32)):
        # init log_qs, log_ps
        log_qs = []
        log_ps = []
            
        # get log_q, log_p for each layer
        for n in range(self.n_layers):
            log_q_n, log_p_n = self.helmholtz_layers[n].log_prob(awake)
            log_qs.append(log_q_n)
            log_ps.append(log_p_n)
            
        # sum across layers
        log_q = T.sum(log_qs, 0)
        log_p = T.sum(log_ps, 0)
        
        # reshape to (batch_size, n_samples)
        log_q = T.reshape(log_q, (D.shape[0]//self.n_samples, self.n_samples)) 
        log_p = T.reshape(log_p, (D.shape[0]//self.n_samples, self.n_samples))
        
        # get importance weights
        w = self.importance_weighting(log_q, log_p)
        
        # compute log likelihood
        log_pq = (log_p - log_q) / 2.
        log_w_sum = log_pq - T.log(w + 1e-6)
        LL = log_w_sum - T.log(self.n_samples)
        
        # cost and log likelihood
        return LL #T.sum(w * (log_p + log_q))
            
    def save_model(self, file_name):
        with open(file_name, 'wb') as f:
            pickle.dump(self, f)
        
    def show_W(self, layer_idx, W_idx=0, W_type='generative'):
        # get WG
        if W_type == 'generative':
            W = self.helmholtz_layers[layer_idx].WG.T
        else: # get WR
            W = self.helmholtz_layers[layer_idx].WR
        # get img shape
        img_shape = [int(np.sqrt(W.shape[0]))]*2
        # show W
        plt.imshow(W[:,W_idx].reshape(img_shape), cmap='gray')
        plt.show()
    
    def update_params(self, params, gparams, inc_params, lr, m=0.): 
        #
        with T.no_grad():
            for param, gparam, inc_param in zip(params, gparams, inc_params):
                inc_param *= m
                inc_param += lr * gparam
                param -= inc_param
        
    def train(self, train_data, lr=None, awake=1., m=0., k=T.tensor(0, dtype=T.int32),
              opts={'optimizer': None, 'momentum': False, 'k_sparse': False, 
                    'n_samples': 1, 'autograd': True}):
        '''WRITEME'''
        # set vars
        if lr is None and 'lr' in opts:
            lr = opts['lr']
        if not opts['autograd']:
            self.v = train_data
        elif 'optimizer' in opts:
            optimizer = opts['optimizer']
        if opts['k_sparse']:
            self.k = T.tensor(k, dtype=T.int32)
        if 'n_samples' in opts:
            self.n_samples = opts['n_samples']
        
        # get grad_fn and cost_fn from opts
        if 'cost_fn' in opts:
            cost_fn = opts['cost_fn']
            # get cost
            cost = cost_fn(train_data, awake)
        else:
            cost = None
        if 'grad_fn' in opts:
            grad_fn = opts['grad_fn']
        else:
            return cost
        
        # forward pass if awake, downward pass if asleep
        if awake:
            helm.forward(train_data)
        else:
            helm.downward()
        
        # get wake/sleep derivatives
        for n in range(self.n_layers):
            # set params
            if awake:
                params = self.helmholtz_layers[n].gen_params
                inc_params = self.helmholtz_layers[n].inc_params[:2]
            else:
                params = self.helmholtz_layers[n].rec_params
                inc_params = self.helmholtz_layers[n].inc_params[2:]
            if opts['autograd']: # automatic derivatives
                grad_fn(train_data, awake).backward(retain_graph=True)
                # update params if no optimizer
                if 'optimizer' not in opts:
                    gparams = T.autograd.grad(grad_fn(train_data, awake), params, retain_graph=True)
                    self.update_params(params, gparams, inc_params, lr, m)
            elif awake: # manual wake derivatives
                gparams = self.helmholtz_layers[n].get_wake_derivs()
                # update params
                self.update_params(params, gparams, inc_params, lr, m)
            elif not awake: # manual sleep derivatives
                gparams = self.helmholtz_layers[n].get_sleep_derivs()
                # update params
                self.update_params(params, gparams, inc_params, lr, m)
        
        # take step with optimizer
        if opts['autograd'] and 'optimizer' in opts:
            optimizer.step()
                    
        return cost
        
    def set_opts(self, batch_size, grad_fn='free_energy', cost_fn='free_energy',
                       opts={'momentum': False, 'k_sparse': False, 'lr': 1e-3,
                             'n_samples': 1, 'autograd': True}):
        '''WRITEME
        Create training function with given update rule, momentum, and sparsity
        
        Parameters
        ----------
        train_data: theano matrix, training data to use
        batch_size: int, size of each training mini-batch
        cost_type: str, cost type to use for update rule ['free_energy' (default) or 'log_likelihood']
        opts: dict, options for momentum, sparsity, importance sampling, and automatic differentiation
            'momentum': bool, True/False use momentum [default: False]
            'k_sparse': bool, True/False to do k-sparse (includes new input for train_fn) [default: False]
            'n_samples': int, number of samples to use for importance sampling [default: 1] (cost_type='loglikelihood')
            'autograd': bool, True/False use automatic differentiation [default: True] (cost_type='free_energy')
            
        Returns
        -------
        train_fn: theano function, training function with following inputs
            idx: int, index of training mini-batch (i.e. train_data[idx*batch_size:(idx+1)*batch_size])
            awake: bool, True/False awake for wake/sleep algorithm
            m: float, momentum to apply (if opts['momentum'] == True)
            k: float, fraction of most active units to keep (less active set to 0; if opts['k_sparse'] == True)
            *lr: float(s), learning rates for each layer
            
        Note
        ----
        In order to use k-sparse, when initializing HelmholtzMachine set k=T.scalar('k').
        '''
        # init vars
        self.batch_size = batch_size
        
        # init momentum, k_sparse, lr
        if 'momentum' not in opts:
            opts['momentum'] = False
        if 'k_sparse' not in opts:
            opts['k_sparse'] = False
        if 'lr' not in opts:
            opts['lr'] = 1e-3
        
        # n_samples (add to self so log_likelihood() has access)
        if 'n_samples' in opts and (grad_fn == 'log_likelihood' or cost_fn == 'log_likelihood'):
            self.n_samples = opts['n_samples']
        else:
            self.n_samples = 1
            
        # get gradient function
        if grad_fn == 'free_energy':
            opts['grad_fn'] = self.free_energy
        elif grad_fn == 'log_likelihood':
            opts['cost_fn'] = self.log_likelihood
            opts['autograd'] = True
        
        # set optimizer
        if opts['autograd']:
            opts['optimizer'] = T.optim.SGD(helm.params, lr=opts['lr'])
            opts['optimizer'].zero_grad()
        
        # get cost function
        if cost_fn == 'free_energy':
            opts['cost_fn'] = self.free_energy
        elif cost_fn == 'log_likelihood':
            opts['cost_fn'] = self.log_likelihood
        
        return opts

In [981]:
# create bar data
data = np.zeros((1000, 3, 3))
for n in range(data.shape[0]):
    # Horizontal bars 1/3, vertical bars 2/3
    if np.random.rand() > (2./3.): 
        data[n, np.random.randint(data.shape[1]), :] = 1.
    else:
        data[n, :, np.random.randint(data.shape[2])] = 1.
data = np.reshape(data, (data.shape[0], -1))
data = T.as_tensor(data, dtype=floatX)

In [1102]:
helm = HelmholtzMachine([9,6,1])

In [1103]:
opts = helm.set_opts(1, opts={'autograd': False, 'lr': 1e-5})

In [1104]:
cost = []
for epoch in range(1000):
    # train
    helm.train(data[None,0], awake=1, lr=1e-2, opts=opts)
    cost.append(helm.train(data[None,0], awake=0, lr=1e-2, opts=opts))
    # print cost
    print(cost[-1], end='\r')

tensor(0.3437, grad_fn=<SumBackward0>)