# RNN Example 2: Estimation

In [1]:
import numpy as np
from scipy.optimize import minimize

In [4]:
def param_toWeights(param):    
    #parameter U: x(t) -> h(t), 3 x 2
    input_weights_U = np.array([[param[0], param[1]],
                                [param[2], param[3]],
                                [param[4],param[5]]]) 
    #parameter W: x(t) -> h(t), 3 x 3
    hidden_weights_W =  np.array([[param[6], param[7], param[8]],
                                  [param[9], param[10], param[11]],
                                  [param[12], param[13], param[14]]]) 
    #parameter b: a(t)=b+W h(t-1)+Ux(t) and h(t) = tahn(a(t)), 3 x 1
    hidden_bias = np.array([[param[15]],
                            [param[16]],
                            [param[17]]]) 
    #parameter V: y(t)= c + Vh(t) 2 x 3
    output_weights_V = np.array([[param[18], param[19], param[20]],
                                 [param[21], param[22], param[23]]])
    #parameter c: y(t)= c + Vh(t) 2 x 1
    output_bias =  np.array([[param[24]],
                             [param[25]]]) 

    return input_weights_U, hidden_weights_W, hidden_bias, output_weights_V, output_bias

def RNN_CrossEntropy(param,*args):
    #Initialize weights and biases
    input_weights_U, hidden_weights_W, hidden_bias, output_weights_V, output_bias = param_toWeights(param)
    #Check what happens when we run the network using these weights?
    #Forward pass
    xs, target, hidden_states, outputs, probabilities = {}, {}, {}, {}, {}
    # cumsum of log density --> likelihood
    loss = 0 
    hidden_states[-1] = np.copy(hidden_state_prev)
    for t in range(len(inputs)): 
        # one-hot-encoding the input character 
        xs[t] = np.zeros((vocab_size,1))  
        character = inputs[t]
        xs[t][character] = 1 
        target[t] = np.zeros((vocab_size,1))  # 2 x 1
        target_character = targets[t]
        target[t][target_character] = 1
        # Compute hidden state 
        hidden_states[t] = np.tanh(input_weights_U @ xs[t] + hidden_weights_W @ hidden_states[t-1] + hidden_bias) 
        # Compute output and probabilities
        outputs[t] = output_weights_V @ hidden_states[t] + output_bias
        probabilities[t] = np.exp(outputs[t]) / np.sum(np.exp(outputs[t]))
        #Compute cross-entropy loss
        loss += - sum(target[t] * np.log(probabilities[t]))
    
    return loss

In [5]:
# ============================ # 
# RNN STARTS:
# ============================ #  
#Initialize inputs and targets
vocab_size = 2
hidden_size = 3
hidden_state_prev =  np.array([[0.95],[-0.98],[0.98]]) 
inputs = [1,0,1,0]
targets = [0,1,0,1]

#Initialize weights and biases (parameters)
#parameter U: x(t) -> h(t), 3 x 2
input_weights_U = np.random.randn(hidden_size, vocab_size) * 0.01
input_weights_U_stack = np.hstack(input_weights_U)
#parameter W: x(t) -> h(t), 3 x 3
hidden_weights_W = np.random.randn(hidden_size, hidden_size) * 0.01
hidden_weights_W_stack = np.hstack(hidden_weights_W)
#parameter b: a(t)=b+W h(t-1)+Ux(t) and h(t) = tahn(a(t)), 3 x 1
hidden_bias = np.zeros((hidden_size, 1)) 
hidden_bias_stack = np.hstack(hidden_bias)
#parameter V: y(t)= c + Vh(t) 2 x 3
output_weights_V = np.random.randn(vocab_size, hidden_size) * 0.01
output_weights_V_stack = np.hstack(output_weights_V)
#parameter c: y(t)= c + Vh(t) 2 x 1
output_bias = np.zeros((vocab_size, 1))
output_bias_stack = np.hstack(output_bias)

param0 = np.hstack((input_weights_U_stack,hidden_weights_W_stack,hidden_bias_stack,
                   output_weights_V_stack, output_bias_stack))
results = minimize(RNN_CrossEntropy, param0, method='BFGS', tol=1e-8, options={'disp': True})
param_star = results.x

# ============================ #
# PARAMETER ESTIMATES: 
# ============================ #    
input_weights_U_star, \
hidden_weights_W_star, \
hidden_bias_star, \
output_weights_V_star, \
output_bias_star = param_toWeights(param_star)

print('The optimal input weights U are:')
print(input_weights_U_star)
print('The optimal hidden state weights W are:')
print(hidden_weights_W_star)
print('The optimal hidden state bias b are:')
print(hidden_bias_star)
print('The optimal output weights V are:')
print(output_weights_V_star)
print('The optimal output bias c are:')
print(output_bias_star)

Optimization terminated successfully.
         Current function value: 0.000000
         Iterations: 20
         Function evaluations: 702
         Gradient evaluations: 26
The optimal input weights U are:
[[ 0.84182004 -0.85639161]
 [-3.11145965  3.10381649]
 [-5.73631536  5.71312205]]
The optimal hidden state weights W are:
[[-0.26263566  0.33832677 -0.19223223]
 [ 1.30427041 -1.44103652  1.22418215]
 [ 2.67612822 -2.89916107  2.65021927]]
The optimal hidden state bias b are:
[[-0.03528076]
 [-0.02285487]
 [-0.02064066]]
The optimal output weights V are:
[[-0.34853607  2.96739134  7.02440513]
 [ 0.33641209 -2.94872708 -7.02584119]]
The optimal output bias c are:
[[-0.03097484]
 [ 0.03098114]]
