In [1]:
import numpy as np

def sigmoid(x):
    return 1/(1+np.exp(-x))

* http://colah.github.io/posts/2015-08-Understanding-LSTMs/

* http://arxiv.org/abs/1409.2329

![title](imgs/LSTM3-chain.png)

![title](imgs/LSTM2-notation.png)

In [2]:
data_x1 = np.array([[0., 1.], [1, 0], [1, 1], [0, -1], [-1, 0], [-1, -1], [-1,-1]])
data_x2 = np.array([[1., 0.], [-1, 1], [0, -2], [2, 1], [1, -1], [2, 2], [1, 1]])


data_dim =  data_x1.shape[1]
n_inputs = data_x1.shape[0]
n_nodes = 3
n_steps = 2

weights_dim = [data_dim + n_nodes, n_nodes]

lt_memory0 = np.zeros([n_inputs, n_nodes])
st_memory0 = np.zeros([n_inputs, n_nodes])

lt_memory0

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

## Forget gate layer

![title](imgs/LSTM3-focus-f.png)

In [3]:
np.random.seed(666)

# Wf = np.zeros(weights_dim) # forget half
# Wf = np.ones(weights_dim) * 100 # do not forget anything
# Wf = np.ones(weights_dim) * (-100) # do not forget anything
Wf = 2 * np.random.rand(*weights_dim) - 1
bf = 2*np.random.rand(n_nodes) - 1

def gateLayer(x, h, W, b):
    return sigmoid(np.matmul(np.concatenate([x, h], axis=1), W) + b)

forget_gate1 = gateLayer(data_x1, st_memory0, Wf, bf)
forget_gate1

array([[ 0.73197675,  0.47829704,  0.14818873],
       [ 0.72108162,  0.42521307,  0.39621623],
       [ 0.803063  ,  0.64600098,  0.19847554],
       [ 0.52329382,  0.13093712,  0.54991046],
       [ 0.53695366,  0.15733985,  0.24465725],
       [ 0.42369298,  0.0703666 ,  0.46189409],
       [ 0.42369298,  0.0703666 ,  0.46189409]])

## Input gate layer & tanh layer

![title](imgs/LSTM3-focus-i.png)

In [4]:
Wi = 2 * np.random.rand(*weights_dim) - 1
bi = 2*np.random.rand(n_nodes)-1

WC = 2 * np.random.rand(*weights_dim) - 1
bC = np.zeros(n_nodes)
bC = 2*np.random.rand(n_nodes)-1

def tanhLayer(x, h, WC, bC):
    return np.tanh(np.matmul(np.concatenate([x, h], axis=1), WC) + bC)

input_gate1 = gateLayer(data_x1, st_memory0, Wi, bi)
print(input_gate1)

tanh_gate1 = tanhLayer(data_x1, st_memory0, WC, bC)
print(tanh_gate1)

[[ 0.63398086  0.38071698  0.47031014]
 [ 0.33550248  0.33821934  0.38522399]
 [ 0.44305822  0.27068111  0.25494187]
 [ 0.41097131  0.53826357  0.74858656]
 [ 0.70532456  0.58372589  0.80839608]
 [ 0.60303837  0.65881495  0.88540241]
 [ 0.60303837  0.65881495  0.88540241]]
[[-0.91020881 -0.84627754 -0.30849614]
 [-0.84210478 -0.84863104  0.25236998]
 [-0.94571281 -0.97001537  0.2274545 ]
 [-0.3853925   0.41343097 -0.25990321]
 [-0.60863589  0.42033073 -0.68730486]
 [-0.14455062  0.85897178 -0.67310054]
 [-0.14455062  0.85897178 -0.67310054]]


## Update Long time memory
![title](imgs/LSTM3-focus-C.png)

In [5]:
lt_memory1 = forget_gate1 * lt_memory0 + input_gate1 * tanh_gate1
lt_memory1

array([[-0.57705497, -0.32219223, -0.14508886],
       [-0.28252824, -0.28702343,  0.09721897],
       [-0.41900584, -0.26256484,  0.05798767],
       [-0.15838526,  0.22253483, -0.19456005],
       [-0.42928584,  0.24535793, -0.55561455],
       [-0.08716957,  0.56590345, -0.59596484],
       [-0.08716957,  0.56590345, -0.59596484]])

## Output gate and generate output

![title](imgs/LSTM3-focus-o.png)

In [6]:
Wo = 2 * np.random.rand(*weights_dim) - 1
bo = 2*np.random.rand(n_nodes)-1

output_gate1 = gateLayer(data_x1, st_memory0, Wo, bo)
print(output_gate1)

output1 = output_gate1 * np.tanh(lt_memory1)
print(output1)

st_memory1 = output1

[[ 0.35988304  0.63548882  0.48129936]
 [ 0.59170634  0.64016035  0.63165424]
 [ 0.55258817  0.75549002  0.61489521]
 [ 0.43632481  0.36626744  0.51697597]
 [ 0.2309443   0.36158622  0.36674008]
 [ 0.26055295  0.2459118   0.38347041]
 [ 0.26055295  0.2459118   0.38347041]]
[[-0.18732691 -0.19794702 -0.06934527]
 [-0.16286327 -0.17885625  0.06121604]
 [-0.21887607 -0.19392896  0.03561643]
 [-0.06853529  0.08018793 -0.09933266]
 [-0.09346877  0.08697961 -0.18509972]
 [-0.02265494  0.12599153 -0.20483916]
 [-0.02265494  0.12599153 -0.20483916]]


In [7]:
forget_gate2 = gateLayer(data_x2, st_memory1, Wf, bf)
print(forget_gate2)

[[ 0.73528559  0.48961272  0.41596747]
 [ 0.64370059  0.3783727   0.11063752]
 [ 0.41346708  0.07816546  0.77566993]
 [ 0.86767698  0.7796711   0.28746087]
 [ 0.65143778  0.22331033  0.67855644]
 [ 0.91636119  0.88649464  0.13551464]
 [ 0.82307373  0.61399485  0.22592079]]


In [8]:
input_gate2 = gateLayer(data_x2, st_memory1, Wi, bi)
print(input_gate2)

tanh_gate2 = tanhLayer(data_x2, st_memory1, WC, bC)
print(tanh_gate2)

[[ 0.31103588  0.28881093  0.38818704]
 [ 0.79049644  0.46203379  0.71612819]
 [ 0.29471608  0.55980818  0.85471167]
 [ 0.23526283  0.17177901  0.10700031]
 [ 0.19775158  0.38391936  0.49576487]
 [ 0.31200378  0.13376138  0.05694747]
 [ 0.38525131  0.26047344  0.22296482]]
[[-0.78432438 -0.88253184  0.22512306]
 [-0.77387942 -0.48696492 -0.72859097]
 [ 0.39736558  0.80327057 -0.26187089]
 [-0.97100529 -0.99465661  0.72107866]
 [-0.64844727 -0.40564165  0.42313858]
 [-0.99259941 -0.9988933   0.72326985]
 [-0.96227947 -0.96793572  0.37180929]]


In [9]:
lt_memory2 = forget_gate2 * lt_memory1 + input_gate2 * tanh_gate2
print(lt_memory2)

[[-0.66825323 -0.41263426  0.02703761]
 [-0.79361252 -0.33359608 -0.51100847]
 [-0.05613509  0.42915393 -0.17884481]
 [-0.3658687   0.00264284  0.02122724]
 [-0.40788449 -0.10094272 -0.16723859]
 [-0.38957358  0.36805703 -0.03957357]
 [-0.44246641  0.09534026 -0.05174046]]


In [10]:
output_gate2 = gateLayer(data_x2, st_memory1, Wo, bo)
print(input_gate2)

output2 = output_gate2 * np.tanh(lt_memory2)
print(output2)

st_memory2 = output2

[[ 0.31103588  0.28881093  0.38818704]
 [ 0.79049644  0.46203379  0.71612819]
 [ 0.29471608  0.55980818  0.85471167]
 [ 0.23526283  0.17177901  0.10700031]
 [ 0.19775158  0.38391936  0.49576487]
 [ 0.31200378  0.13376138  0.05694747]
 [ 0.38525131  0.26047344  0.22296482]]
[[-0.36193563 -0.24914643  0.01671874]
 [-0.1327664  -0.15388951 -0.16010515]
 [-0.02722515  0.09417158 -0.09162839]
 [-0.26709286  0.00222109  0.01537898]
 [-0.26673895 -0.05050129 -0.10521365]
 [-0.27792859  0.31966786 -0.02818019]
 [-0.25550788  0.07233085 -0.03139437]]


In [11]:
numpy_output = [output1, output2]

# TF version

See 25TensorflowExperimentsWithLSTMOneStep first

In [12]:
import tensorflow as tf
lstm_weights = np.concatenate([Wi, WC, Wf, Wo], axis=1)
lstm_biases = np.concatenate([bi, bC, bf, bo])

In [13]:
inputs = tf.placeholder(tf.float32, [n_steps, n_inputs, data_dim])
inputs_seq = tf.unstack(inputs)
initial_ltm = tf.placeholder(tf.float32, [n_inputs, n_nodes])
initial_stm = tf.placeholder(tf.float32, [n_inputs, n_nodes])

with tf.name_scope('rec_layer'):

    lstm_rnn_cell = tf.contrib.rnn.core_rnn_cell.BasicLSTMCell(
        num_units=n_nodes, forget_bias=0,)

    lstm_rnn_output, state = tf.contrib.rnn.static_rnn(
        cell=lstm_rnn_cell, 
        initial_state=[initial_ltm, initial_stm], 
        inputs=inputs_seq, 
        dtype=tf.float32)

In [14]:
lstm_weights_var = [v for v in tf.trainable_variables() if v.name == 'rnn/basic_lstm_cell/weights:0'][0]
assign_lstm_weights = tf.assign(lstm_weights_var, lstm_weights)

lstm_biases_var = [v for v in tf.trainable_variables() if v.name == 'rnn/basic_lstm_cell/biases:0'][0]
assign_lstm_biases = tf.assign(lstm_biases_var, lstm_biases)

feed_dict = {inputs: np.array([data_x1, data_x2]), initial_ltm:lt_memory0, initial_stm: st_memory0}

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init) 
    sess.run(assign_lstm_weights)
    sess.run(assign_lstm_biases)
    
    variables_names = [v.name for v in tf.trainable_variables()]
    values = sess.run(variables_names)
    lstm_out, st = sess.run([lstm_rnn_output, state], feed_dict)
    print(st)  

LSTMStateTuple(c=array([[-0.66825318, -0.41263425,  0.02703761],
       [-0.79361254, -0.33359611, -0.5110085 ],
       [-0.05613507,  0.42915395, -0.17884482],
       [-0.36586872,  0.00264281,  0.02122722],
       [-0.40788448, -0.10094272, -0.16723856],
       [-0.38957357,  0.36805701, -0.03957358],
       [-0.44246644,  0.09534025, -0.05174047]], dtype=float32), h=array([[-0.36193559, -0.2491464 ,  0.01671874],
       [-0.13276641, -0.15388954, -0.16010515],
       [-0.02722514,  0.09417158, -0.0916284 ],
       [-0.26709288,  0.00222106,  0.01537897],
       [-0.26673889, -0.05050128, -0.10521362],
       [-0.27792859,  0.31966782, -0.02818018],
       [-0.25550792,  0.07233085, -0.03139438]], dtype=float32))


In [15]:
print('np:')
print(numpy_output)
print('tf:')
print(lstm_out)

np:
[array([[-0.18732691, -0.19794702, -0.06934527],
       [-0.16286327, -0.17885625,  0.06121604],
       [-0.21887607, -0.19392896,  0.03561643],
       [-0.06853529,  0.08018793, -0.09933266],
       [-0.09346877,  0.08697961, -0.18509972],
       [-0.02265494,  0.12599153, -0.20483916],
       [-0.02265494,  0.12599153, -0.20483916]]), array([[-0.36193563, -0.24914643,  0.01671874],
       [-0.1327664 , -0.15388951, -0.16010515],
       [-0.02722515,  0.09417158, -0.09162839],
       [-0.26709286,  0.00222109,  0.01537898],
       [-0.26673895, -0.05050129, -0.10521365],
       [-0.27792859,  0.31966786, -0.02818019],
       [-0.25550788,  0.07233085, -0.03139437]])]
tf:
[array([[-0.18732689, -0.19794703, -0.06934525],
       [-0.16286328, -0.17885621,  0.06121602],
       [-0.21887606, -0.19392894,  0.03561642],
       [-0.06853528,  0.08018792, -0.09933265],
       [-0.09346876,  0.0869796 , -0.18509974],
       [-0.02265495,  0.12599154, -0.20483917],
       [-0.02265495,  0.12