In [1]:
import numpy as np
import theano
from theano import tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
srng = RandomStreams(seed=12193)
import util

Couldn't import dot_parser, loading of dot files will not be possible.


In [8]:
sz = 6
num_steps = 5

START = T.zeros((1,6))
states = [(START, None, None)]

DUMMY_INPUT = T.zeros((1,6)) # a dummy tensor used as a place-holder
DUMMY = T.zeros((1,0))
vs = util.VariableStore()

In [9]:
def TreeLSTMLayer(lstm_prev, external_state, full_memory_dim, vs, name="tree_lstm", initializer=None, external_state_dim=0):
    # Same computation as the function in util.py, but returns more internal state.
    
    assert full_memory_dim % 2 == 0, "Input is concatenated (h, c); dim must be even."
    hidden_dim = full_memory_dim / 2

    W = vs.add_param("%s/W" % name, (hidden_dim * 2 + external_state_dim, hidden_dim * 5),
                     initializer=initializer)
    b = vs.add_param("%s/b" % name, (hidden_dim * 5,),
                     initializer=util.TreeLSTMBiasInitializer())

    def slice_gate(gate_data, i):
        return gate_data[:, i * hidden_dim:(i + 1) * hidden_dim]

    # Decompose previous LSTM value into hidden and cell value
    l_h_prev = lstm_prev[:, :hidden_dim]
    l_c_prev = lstm_prev[:, hidden_dim:2 * hidden_dim]
    r_h_prev = lstm_prev[:, 2 * hidden_dim:3 * hidden_dim]
    r_c_prev = lstm_prev[:, 3 * hidden_dim:]
    if external_state_dim == 0:
        h_prev = T.concatenate([l_h_prev, r_h_prev], axis=1)
    else:
        h_prev = T.concatenate([l_h_prev, external_state, r_h_prev], axis=1)

    # Compute and slice gate values
    gates = T.dot(h_prev, W) + b
    i_gate, fl_gate, fr_gate, o_gate, cell_inp = [slice_gate(gates, i) for i in range(5)]

    # Apply nonlinearities
    i_gate = T.nnet.sigmoid(i_gate)
    fl_gate = T.nnet.sigmoid(fl_gate)
    fr_gate = T.nnet.sigmoid(fr_gate) 
    o_gate = T.nnet.sigmoid(o_gate)
    cell_inp = T.tanh(cell_inp)
    IFOGf = T.concatenate([i_gate, fl_gate, fr_gate, o_gate, cell_inp])

    # Compute new cell and hidden value
    c_t = fl_gate * l_c_prev + fr_gate * r_c_prev + i_gate * cell_inp
    tanh_c =  T.tanh(c_t)
    h_t = o_gate * tanh_c

    return (T.concatenate([h_t, c_t], axis=1), IFOGf, tanh_c)

In [10]:
# Fprop
for _ in range(num_steps):
    states.append(TreeLSTMLayer(T.concatenate([states[-1][0], DUMMY_INPUT], axis=0), DUMMY, sz, vs))
output = states[-1][0].sum()

In [11]:
def TreeLSTMLayerGradients(W, b, IFOGf, tanhc, h_prev_l, h_prev_r, c_prev_l, c_prev_r, c, delta_h, delta_c, sz):
    # in = [ones(1, size(h_prev_l, 2), 'like', h_prev_l); h_prev_l; h_prev_r];
    inputs = T.concatenate([h_prev_l, h_prev_r])

    def slice_gate(gate_data, i):
        return gate_data[:, i * sz:(i + 1) * sz]

    # IFOGf should be the five gates, saved after the nonlinearities.
    i_gate, fl_gate, fr_gate, o_gate, cell_inp = [slice_gate(IFOGf, i) for i in range(5)]

    # tanhC = tanh(c); # should save between fprop and bprop
    tanhC = T.tanh(c)
    
    # dIFOGf(Or, :) = (tanhC .* delta_h);
    d_o = tanhC * delta_h;
    
    # dC = delta_c + (1 - tanhC .^ 2) .* IFOGf(Or, :) .* delta_h;
    d_c = delta_c + (1 - T.sqr(tanhC)) * o_gate * delta_h
    
    # dIFOGf(Flr, :) = c_prev_l .* dC;
    # dIFOGf(Frr, :) = c_prev_r .* dC;
    # delta_c_l = IFOGf(Flr, :) .* dC;
    # delta_c_r = IFOGf(Frr, :) .* dC;
    d_fl = c_prev_l * d_c
    d_fr = c_prev_r * d_c
    delta_c_l = fl_gate * d_c
    delta_c_r = fr_gate * d_c
    
    # dIFOGf(Ir, :) = IFOGf(Gr, :) .* dC;
    # dIFOGf(Gr, :) = IFOGf(Ir, :) .* dC;
    d_i = cell_inp * d_c
    d_cell_inp = i_gate * d_c

    # Backprop through nonlinearities
    # dIFOG(Gr, :) = (1 - (IFOGf(Gr, :) .^ 2)) .* dIFOGf(Gr, :);
    d_raw_cell_inp = (1 - T.sqr(cell_inp)) * d_cell_inp
    
    # y = IFOGf([Ir Flr Frr Or], :);
    # dIFOG([Ir Flr Frr Or], :) = (y .* (1.0 - y)) .* dIFOGf([Ir Flr Frr Or], :);
    def d_sigmoid(output, delta):
        return output * (1.0 - output) * delta
    d_raw_i = d_sigmoid(i_gate, d_i)
    d_raw_fl = d_sigmoid(fl_gate, d_fl)
    d_raw_fr = d_sigmoid(fr_gate, d_fr)
    d_raw_o = d_sigmoid(o_gate, d_o)
    d_IFOG = T.concatenate([d_raw_i, d_raw_fl, d_raw_fr, d_raw_o, d_raw_cell_inp], axis=1)
    
    # Compute main gradients and deltas.
    # dWLSTM = dIFOG * in';
    d_W = T.dot(d_IFOG, T.transpose(inputs))
    d_b = d_IFOG
    # dHin = WLSTM' * dIFOG;
    d_h_in = T.dot(T.transpose(W), d_IFOG)
    
    # % Compute h deltas.
    # delta_h_l = dHin(2:D + 1, :);
    # delta_h_r = dHin(D + 2:2 * D + 1, :);
    delta_h_l = d_h_in[:, 0:sz + 1]
    delta_h_r = d_h_in[:, sz + 1:2 * sz + 1]
    
    return d_W, d_b, delta_h_l, delta_h_r, delta_c_l, delta_c_r

In [12]:
# Bprop
real_grad = T.grad(output, vs.vars['tree_lstm/W'])

manual_grad = T.zeros((6, 15))
delta_h_l = T.ones((6, 1))
delta_c_l = T.zeros((6, 1))
for i in range(num_steps, 0, -1):
    d_W, d_b, delta_h_l, delta_h_r, delta_c_l, delta_c_r = TreeLSTMLayerGradients(
        vs.vars['tree_lstm/W'], vs.vars['tree_lstm/b'], 
        states[i][1], states[i][2], # IFOGf, tanh_c
        states[i - 1][0][:, 0:sz], DUMMY_INPUT[:, 0:sz], # h_prev 
        states[i - 1][0][:, sz:2 * sz], DUMMY_INPUT[:, sz:2 * sz], # c_prev 
        states[i][0][:, sz:2 * sz], delta_h_l, delta_c_l, sz) # c, delta_h, delta_c, sz    
    manual_grad += d_W

In [None]:
f = theano.function([], (T.eq(manual_grad, real_grad),
                          manual_grad, real_grad))


In [None]:
print "\n".join(str(x) for x in f()

In [9]:
theano.printing.debugprint(real_grad)
theano.printing.debugprint(f.maker.fgraph.outputs[2])
theano.printing.debugprint(T.grad(c1.sum(), w))

Elemwise{add,no_inplace} [@A] ''   
 |Elemwise{add,no_inplace} [@B] ''   
 | |Elemwise{add,no_inplace} [@C] ''   
 | | |Elemwise{add,no_inplace} [@D] ''   
 | | | |dot [@E] ''   
 | | | | |DimShuffle{1,0} [@F] 'a.T'   
 | | | | | |a [@G]
 | | | | |dot [@H] ''   
 | | | |   |dot [@I] ''   
 | | | |   | |dot [@J] ''   
 | | | |   | | |dot [@K] ''   
 | | | |   | | | |dot [@L] ''   
 | | | |   | | | | |Elemwise{second} [@M] ''   
 | | | |   | | | | | |dot [@N] ''   
 | | | |   | | | | | | |dot [@O] ''   
 | | | |   | | | | | | | |dot [@P] ''   
 | | | |   | | | | | | | | |dot [@Q] ''   
 | | | |   | | | | | | | | | |dot [@R] ''   
 | | | |   | | | | | | | | | | |dot [@S] ''   
 | | | |   | | | | | | | | | | | |a [@G]
 | | | |   | | | | | | | | | | | |W [@T]
 | | | |   | | | | | | | | | | |W [@T]
 | | | |   | | | | | | | | | |W [@T]
 | | | |   | | | | | | | | |W [@T]
 | | | |   | | | | | | | |W [@T]
 | | | |   | | | | | | |other [@U]
 | | | |   | | | | | |DimShuffle{x,x} [@V] ''   
 | | | 

NameError: name 'c1' is not defined