In [3]:
# Word Vector Representation
import numpy as np

# Every sentence is padded with the <null> token so that all of them contain the same number of words
raw_txt_inputs = ['hello world i am calvin', 'calvin says hello world <null>']
tokenized_inputs = [sentence.split() for sentence in raw_txt_inputs]

# Build the vocabulary: each distinct word receives the next free index, in order of first appearance
word_to_idx = dict()
for tokens in tokenized_inputs:
    for word in tokens:
        word_to_idx.setdefault(word, len(word_to_idx))
idx = len(word_to_idx)  # index that the next unseen word would receive

# Randomly initialised embedding table; row k holds the word vector of the word with index k
vocab_size, word_vec_dim = idx, 5
word_embedding_weight = np.random.rand(vocab_size, word_vec_dim)

# Replace every word by its vocabulary index, giving one list of indices per sentence
index_sequences = [[word_to_idx[word] for word in tokens] for tokens in tokenized_inputs]

# Indexing the table with an integer array picks one embedding row per word,
# producing an array of shape (number of sentences, words per sentence, word_vec_dim)
input_sequences = word_embedding_weight[np.array(index_sequences)]

print(input_sequences)
print('\nInput sequence has shape', input_sequences.shape)

[[[0.46391225 0.81681318 0.9856427  0.99566879 0.18224726]
  [0.19485853 0.18646619 0.95706079 0.41282122 0.25447514]
  [0.88389975 0.23759714 0.59183858 0.75197443 0.93150512]
  [0.04709183 0.00130502 0.15083255 0.51619873 0.38410871]
  [0.22399013 0.82668565 0.2677109  0.31410795 0.78524972]]

 [[0.22399013 0.82668565 0.2677109  0.31410795 0.78524972]
  [0.35185803 0.64860544 0.06151891 0.9797043  0.82781468]
  [0.46391225 0.81681318 0.9856427  0.99566879 0.18224726]
  [0.19485853 0.18646619 0.95706079 0.41282122 0.25447514]
  [0.15984876 0.40287915 0.42810361 0.703509   0.80318853]]]

Input sequence has shape (2, 5, 5)


In [4]:
# Forward Step & Sequence Example

def _forward_step(self, x, prev_hidden_state, prev_cell_state):
    """Forward pass for a single time step of the LSTM layer.

    :param np.array x: Input data of shape (N, D)
    :param np.array prev_hidden_state: Previous hidden state of shape (N, H)
    :param np.array prev_cell_state: Previous cell state of shape (N, H)

    Returns tuple:
        - next_hidden_state: Next hidden state, of shape (N, H)
        - next_cell_state: Next cell state, of shape (N, H)
        - cache: Tuple of values needed for back-propagation
    """
    _, H = prev_hidden_state.shape

    # Compute activations
    acts = np.dot(x, self.Wx) + np.dot(prev_hidden_state, self.Wh) + self.b

    # Compute the internal gates
    input_gate = sigmoid(acts[:, 0:H])
    forget_gate = sigmoid(acts[:, H:2*H])
    output_gate = sigmoid(acts[:, 2*H:3*H])
    gain_gate = np.tanh(acts[:, 3*H:4*H])

    # Compute next states
    next_cell_state = forget_gate * prev_cell_state + input_gate * gain_gate
    next_hidden_state = output_gate * np.tanh(next_cell_state)

    # Cache the results
    cache = {
        'x': x,
        'next-c': next_hidden_state,
        'next-h': next_cell_state,
        'i-gate': input_gate,
        'f-gate': forget_gate,
        'o-gate': output_gate,
        'g-gate': gain_gate,
        'prev-h': prev_hidden_state,
        'prev-c': prev_cell_state
    }

    return next_hidden_state, next_cell_state, cache


def forward(self, input_sequence, h0, Wx=None, Wh=None, b=None):
    """Run the LSTM layer over a whole mini-batch of sequences.

    The input holds N sequences of T vectors, each of dimension D, and the layer uses a hidden size
    of H. The cell state starts at zero and the hidden state starts at h0; both are then advanced
    one time step at a time with ``self._forward_step``.

    Besides returning the hidden states, the call stores on ``self``: ``input_sequence``, ``h0``,
    ``hidden_states_over_t``, ``cell_states_over_t`` and ``caches`` (one entry per time step).

    :param np.array input_sequence: Input data of shape (N, T, D)
    :param np.array h0: Initial hidden state of shape (N, H)
    :param np.array Wx: Optional input-to-hidden weight matrix, of shape (D, 4H)
    :param np.array Wh: Optional hidden-to-hidden weight matrix, of shape (H, 4H)
    :param np.array b: Optional bias vector, of shape (4H,)

    Returns np.array:
        Hidden state over time of shape (N, T, H)
    """
    # The layer parameters are replaced only when the caller supplies all three of them
    if not (Wx is None or Wh is None or b is None):
        self.Wx, self.Wh, self.b = Wx, Wh, b

    batch_size, num_steps, _ = input_sequence.shape
    _, hidden_dim = h0.shape

    # Remember the inputs and allocate the per-time-step records
    self.input_sequence, self.h0 = input_sequence, h0
    states_shape = (batch_size, num_steps, hidden_dim)
    self.hidden_states_over_t = np.zeros(states_shape)
    self.cell_states_over_t = np.zeros(states_shape)
    self.caches = {}

    # Walk through the sequence, feeding each step's states into the next step
    hidden_state, cell_state = h0, np.zeros(h0.shape)
    for t in range(num_steps):
        hidden_state, cell_state, step_cache = self._forward_step(input_sequence[:, t, :],
                                                                  hidden_state,
                                                                  cell_state)
        self.caches[t] = step_cache
        self.hidden_states_over_t[:, t, :] = hidden_state
        self.cell_states_over_t[:, t, :] = cell_state

    return self.hidden_states_over_t


# Backprop Step & Sequence

def _backward_step(self, grad_next_hidden_state, grad_next_cell_state, cache):
    """Backward pass for a single time step of the LSTM layer.

    Args:
        grad_next_hidden_state (np.array): Gradient of next hidden state, of shape (N, H)
        grad_next_cell_state (np.array): Gradient of next cell state, of shape (N, H)
        cache (tuple): Cache object from the forward pass

    Returns tuple:
        - grad_x: Gradients of time step input, of shape (N, D)
        - grad_prev_hidden_state: Gradients of previous hidden state, of shape (N, H)
        - grad_prev_cell_state: Gradients of previous cell state, of shape (N, H)
        - grad_Wx: Gradients of input-to-hidden weights, of shape (D, 4H)
        - grad_Wh: Gradients of hidden-to-hidden weights, of shape (H, 4H)
        - grad_b: Gradients of bias, of shape (4H,)
    """
    x, _, next_c, i_gate, f_gate, o_gate, g_gate, prev_h, prev_c = cache

    # Note that grad_prev_c has two contributions, one from grad_next_cell_state and another one from
    # grad_next_hidden_state
    grad_next_h_next_c = o_gate * ( 1 - (np.tanh(next_c) * np.tanh(next_c)))

    grad_prev_cell_state = (grad_next_hidden_state * grad_next_h_next_c * f_gate) + (grad_next_cell_state * f_gate)

    # Each gate needs to go through the derivative of non-linearity
    grad_i_gate = (grad_next_hidden_state * grad_next_h_next_c * g_gate) + (grad_next_cell_state * g_gate)
    grad_i_gate = grad_i_gate * i_gate * (1 - i_gate)

    grad_f_gate = (grad_next_hidden_state * grad_next_h_next_c * prev_c) + (grad_next_cell_state * prev_c)
    grad_f_gate = grad_f_gate * f_gate * (1 - f_gate)

    grad_o_gate = grad_next_hidden_state * np.tanh(next_c)
    grad_o_gate = grad_o_gate * o_gate * (1 - o_gate)

    grad_g_gate = (grad_next_hidden_state * grad_next_h_next_c * i_gate) + (grad_next_cell_state * i_gate)
    grad_g_gate = grad_g_gate * (1 - g_gate * g_gate)

    # Now stack them
    grad_act = np.concatenate((grad_i_gate, grad_f_gate, grad_o_gate, grad_g_gate), axis=1)

    # And then do the same ol' gradient calculations
    grad_x = np.dot(grad_act, self.Wx.T)
    grad_prev_hidden_state = np.dot(grad_act, self.Wh.T)
    grad_Wx = np.dot(x.T, grad_act)
    grad_Wh = np.dot(prev_h.T, grad_act)
    grad_b = np.sum(grad_act, axis=0)

    return grad_x, grad_prev_hidden_state, grad_prev_cell_state, grad_Wx, grad_Wh, grad_b


def backward(self, grad_hidden_state_over_t):
    """Back-propagate through the LSTM layer over an entire sequence of data.

    Time steps are visited from last to first. At each step the upstream gradient of that step's
    hidden state is added to the hidden-state gradient flowing back from the following step, and
    ``self._backward_step`` is called with the cache recorded for that step in ``self.caches``.

    Args:
        grad_hidden_state_over_t (np.array): Upstream gradients of hidden states, of shape (N, T, H)

    Returns tuple:
        - grad_input_seq: Gradient of the input data, of shape (N, T, D)
        - grad_h0: Gradient of the initial hidden state, of shape (N, H)
        - grad_Wx: Gradient of input-to-hidden weight matrix, of shape (D, 4H)
        - grad_Wh: Gradient of hidden-to-hidden weight matrix, of shape (H, 4H)
        - grad_b: Gradient of biases, of shape (4H,)
    """
    N, T, H = grad_hidden_state_over_t.shape

    # Accumulators; the parameter gradients are summed over all time steps
    grad_input_seq = np.zeros(self.input_sequence.shape)
    grad_Wx = np.zeros(self.Wx.shape)
    grad_Wh = np.zeros(self.Wh.shape)
    grad_b = np.zeros(self.b.shape)

    # Nothing flows back into the final time step from a later one, so both carries start at zero
    carried_grad_h = np.zeros((N, H))
    carried_grad_c = np.zeros((N, H))

    for t in range(T - 1, -1, -1):
        total_grad_h = grad_hidden_state_over_t[:, t, :] + carried_grad_h
        step_grads = self._backward_step(total_grad_h, carried_grad_c, self.caches[t])

        grad_input_seq[:, t, :] = step_grads[0]
        carried_grad_h = step_grads[1]
        carried_grad_c = step_grads[2]

        grad_Wx += step_grads[3]
        grad_Wh += step_grads[4]
        grad_b += step_grads[5]

    # After step 0 the carried hidden-state gradient is the gradient of the initial hidden state
    return grad_input_seq, carried_grad_h, grad_Wx, grad_Wh, grad_b


# The softmax score is used to classify word vectors into the appropriate word index.

import matplotlib.pyplot as plt
%matplotlib inline

from rnn.lstm_recurrent_model import LSTMRecurrentModel
from rnn.lstm_solver import LSTMSolver
from rnn.data_util import load_word_based_text_input

# Locations of the two text files and the sequence length handed to the loader
seq_length = 30
x_filepath = 'rnn/datasets/questions.txt'
y_filepath = 'rnn/datasets/answers.txt'

# The loader returns four values: the two datasets plus the word <-> index lookup tables
loaded_text = load_word_based_text_input(x_filepath, y_filepath, seq_length)
questions, answers, word_to_idx, idx_to_word = loaded_text

# Training data in the form the solver receives through its feed_dict argument
feed_dict = dict(training_x=questions, training_y=answers)

model = LSTMRecurrentModel(word_to_idx, idx_to_word)


# Train the model with a solver that uses the Adam update rule
solver_options = dict(batch_size=10,
                      num_epochs=300,
                      print_every=10,
                      learning_rate_decay=0.99,
                      update_rule='adam',
                      verbose=False)
solver = LSTMSolver(model, feed_dict=feed_dict, **solver_options)
iters, losses = solver.train()

# Plot the second value returned by train() against the first, then clear the figure
plt.plot(iters, losses)
plt.show()
plt.clf()


from rnn.data_util import convert_string_to_index_matrix

# Convert a question string with the word_to_idx vocabulary and pass the result to model.sample.
# NOTE(review): presumably sample() generates the model's answer for the question - confirm in
# rnn.lstm_recurrent_model.
input_sequence = convert_string_to_index_matrix("What did you eat?", word_to_idx)
model.sample(input_sequence)

# Same procedure for a second question; input_sequence is overwritten with the new conversion.
input_sequence = convert_string_to_index_matrix("Where is the sky?", word_to_idx)
model.sample(input_sequence)

ModuleNotFoundError: No module named 'rnn'