In [None]:
import numpy as np
from modules import *

In [None]:
class LstmParameters:
    """Weights, biases and gradient buffers for the four LSTM gates.

    Attribute names are built from a prefix and a one-letter gate suffix.

    Prefixes:
        w  - weight matrix
        b  - bias vector
        dw - gradient buffer for the matching weight matrix
        db - gradient buffer for the matching bias vector

    Suffixes:
        c - candidate cell state (the "c bar t" term)
        i - input gate
        f - forget gate
        o - output gate
    """

    # Gate suffixes, in the order in which parameters are created and updated.
    _GATES = ("c", "i", "f", "o")

    def __init__(self, ct_dim, x_dim):
        """
        Create randomly initialised parameters and zeroed gradient buffers.

        ct_dim - the dimension of the cell state (Ct)
        x_dim - the dimension of a single input

        Weights and biases come from ``rand_arr`` (provided by the
        ``modules`` import) called with bounds -0.1 and 0.1; presumably this
        draws random values in that range with the requested shape - confirm
        against ``modules``.  Gradient buffers start as zeros: shape
        ``(ct_dim, x_dim + ct_dim)`` for weights and ``ct_dim`` for biases.
        """
        self.ct_dim = ct_dim
        self.x_dim = x_dim
        combined_dim = x_dim + ct_dim

        # All weights are drawn before any bias so that the sequence of
        # rand_arr calls is wc, wi, wf, wo, bc, bi, bf, bo.
        for gate in self._GATES:
            setattr(self, "w" + gate, rand_arr(-0.1, 0.1, ct_dim, combined_dim))
        for gate in self._GATES:
            setattr(self, "b" + gate, rand_arr(-0.1, 0.1, ct_dim))

        # Gradient buffers: weights first, then biases.
        for gate in self._GATES:
            setattr(self, "dw" + gate, np.zeros((ct_dim, combined_dim)))
        for gate in self._GATES:
            setattr(self, "db" + gate, np.zeros(ct_dim))

    def apply_derivatives(self, alpha=1):
        """
        Take one gradient-descent step and clear the gradient buffers.

        alpha - the learning rate; every parameter is decreased in place by
                ``alpha`` times its gradient buffer.

        After the step each gradient buffer is rebound to a new zero array
        with the same shape and dtype as its parameter.
        """
        param_names = [kind + gate for kind in ("w", "b") for gate in self._GATES]

        # Step every parameter first ...
        for name in param_names:
            param = getattr(self, name)
            param -= alpha * getattr(self, "d" + name)
            setattr(self, name, param)

        # ... then reset all derivatives to zero.
        for name in param_names:
            setattr(self, "d" + name, np.zeros_like(getattr(self, name)))

In [None]:
class LstmCellState:
    """Per-cell working values of an LSTM cell, all zero-initialised.

    Attributes (each created with ``np.zeros(ct_dim)``):
        c  - candidate cell state
        i  - input gate activation
        f  - forget gate activation
        o  - output gate activation
        s  - present cell state; per the original notes it is computed as
             forget_gate * previous_state(s<t-1>) + c_gate * input_gate
        h  - output state of the cell
        dh - derivative of h
        ds - derivative of s
    """

    def __init__(self, ct_dim, x_dim):
        """
        Allocate zero arrays for every gate, state and derivative.

        ct_dim - the dimension of the cell state; sizes every array here
        x_dim - accepted as a constructor argument but not used in this method
        """
        # One independent zero array per attribute (no sharing between them).
        for field in ("c", "i", "f", "o", "s", "h"):
            setattr(self, field, np.zeros(ct_dim))

        # Derivative buffers mirror the arrays they belong to.
        self.dh = np.zeros_like(self.h)
        self.ds = np.zeros_like(self.s)