In [52]:
import numpy as np

In [53]:
import sys
from pathlib import Path

# Make the sibling NN source directory importable. The path is assembled with
# pathlib so the separator is right on every OS (a hard-coded "..\\NN\\" only
# resolves on Windows), and the membership guard keeps re-running this cell
# from stacking duplicate entries on sys.path.
_nn_dir = str(Path("..") / "NN")
if _nn_dir not in sys.path:
    sys.path.append(_nn_dir)

In [54]:
from activation import Sigmoid, ReLU
from loss import BinaryCrossEntropy
from optimizer import GradientDescentOptimizer
from layer import Dense
from model import Sequential

In [90]:
class BasicRNN:
    """Basic recurrent layer assembled from two Dense layers.

    At every timestep the current input x_t is concatenated with the previous
    hidden state h_{t-1} and fed to ``hidden_layer`` to produce h_t;
    ``output_layer`` then maps h_t to the output y_t.

    Shorthand used in comments: m = batch size, t = timesteps, f = input features.
    """

    def __init__(self, input_size, output_activation, output_units, hidden_units):
        """
        input_size: dimension of 1 sample: (timestep, features)
        output_activation: activation of output layer
        output_units: no. of neurons in output layer
        hidden_units: no. of neurons in hidden layer

        Raises ValueError when input_size does not have exactly two entries.
        """
        # Validate every sized input (not only tuples) and reject scalars up front
        # instead of failing later on input_size[1].
        try:
            n_dims = len(input_size)
        except TypeError:
            n_dims = None
        if n_dims != 2:
            raise ValueError(f"Incompatible input shape, got {input_size}")

        self.output_activation = output_activation
        self.hidden_units = hidden_units
        self.output_units = output_units
        self.input_units = input_size[1] # f
        self.timestep = input_size[0]

        # The hidden layer consumes [x_t, h_{t-1}] stacked along the feature axis,
        # hence its input width is hidden_units + input_units.
        self.hidden_layer = Dense(units=hidden_units, activation=ReLU(), input_size=self.hidden_units+self.input_units)
        self.output_layer = Dense(units=output_units, activation=self.output_activation, input_size=hidden_units)

    def _initial_hidden_state(self, m, start_sequence=None):
        """ returns start_sequence when given, otherwise a zero state of shape (m, hidden_units) """
        # `is None` on purpose: `!= None` on an ndarray compares elementwise and the
        # resulting array cannot be used as an `if` condition.
        if start_sequence is None:
            return np.zeros((m, self.hidden_units))
        return start_sequence

    def eval(self, X, start_sequence=None): # m,t,f
        """ forward pass over self.timestep steps

        X: input batch indexed as X[:, i, :] per timestep (m, t, f)
        start_sequence: optional initial hidden state; zeros when omitted
        returns: array of shape (m, timestep, output_units)
        """
        ht_1 = self._initial_hidden_state(X.shape[0], start_sequence)
        y = np.empty((X.shape[0], self.timestep, self.output_units)) # m, t, output_units
        for i in range(self.timestep):
            xt_stacked = np.concatenate([X[:,i,:], ht_1], axis=1)
            ht = self.hidden_layer.eval(xt_stacked)
            y[:,i,:] = self.output_layer.eval(ht)
            # Carry the hidden state forward; without this every step would see the
            # initial state and the layer would not be recurrent.
            ht_1 = ht
        return y

    def get_parameter_shape(self):
        ## wh, wo, bh, bo
        """ returns shape of Wh, Wo (bias shapes are not returned) """
        wh_shape, _ = self.hidden_layer.dot.get_parameter_shape()
        wo_shape, _ = self.output_layer.dot.get_parameter_shape()
        return wh_shape, wo_shape

    def get_total_parameters(self):
        """ returns the total number of scalar parameters: weights and biases of both layers """
        wh_shape, bh_shape = self.hidden_layer.dot.get_parameter_shape()
        wo_shape, bo_shape = self.output_layer.dot.get_parameter_shape()
        return np.prod(wh_shape) + np.prod(bh_shape) + np.prod(wo_shape) + np.prod(bo_shape)

    def get_output_size(self):
        """ returns output shape corresponding to just 1 sample"""
        return self.timestep, self.output_units

    def grad_parameters_t(self, xt, ht_1):
        """ computes dyt_dwo, dyt_dbo, dyt_dwh, dyt_dbh for one timestep

        Only the direct dependence within this step is covered; the dependence of
        ht_1 on the hidden weights (through earlier steps) is not included here.
        The new hidden state ht is returned as the fifth value.
        """
        xt_stacked = np.concatenate([xt, ht_1], axis=1)
        ht = self.hidden_layer.eval(xt_stacked)
        dyt_dwo, dyt_dbo = self.output_layer.grad_parameters(ht)
        dyt_dht = self.output_layer.grad_input(ht)

        dht_dwh, dht_dbh = self.hidden_layer.grad_parameters(xt_stacked)
        # Chain rule through the hidden state: contract the hidden-unit axis.
        dyt_dwh = np.einsum('mij,mjkl->mikl', dyt_dht, dht_dwh)
        dyt_dbh = np.einsum('mij,mjk->mik',  dyt_dht, dht_dbh)
        return dyt_dwo, dyt_dbo, dyt_dwh, dyt_dbh, ht

    def grad_input_t(self, xt, ht_1):
        """ computes dyt_dxt, dyt_dht_1 for one timestep

        returns: dyt_dxt, dyt_dht_1, dht_dht_1, dht_dxt, ht
        """
        xt_stacked = np.concatenate([xt, ht_1], axis=1)
        ht = self.hidden_layer.eval(xt_stacked)

        dyt_dht = self.output_layer.grad_input(ht)

        # Jacobian wrt the stacked input; the first input_units columns belong to
        # x_t and the remaining ones to h_{t-1} (same order as the concatenation).
        dht_dxt_stacked = self.hidden_layer.grad_input(xt_stacked)
        dht_dxt = dht_dxt_stacked[:,:,:self.input_units]
        dht_dht_1 = dht_dxt_stacked[:,:,self.input_units:]

        dyt_dxt_stacked = np.einsum('mij,mjk->mik', dyt_dht, dht_dxt_stacked)

        dyt_dxt = dyt_dxt_stacked[:,:,:self.input_units]
        dyt_dht_1 = dyt_dxt_stacked[:,:,self.input_units:]
        return dyt_dxt, dyt_dht_1, dht_dht_1, dht_dxt, ht

    def grad_input(self, X, start_sequence=None):
        """ computes dY_dX: grad of output wrt input for all timesteps/ cells

        returns: array of shape (m, t, t, output_units, input_units) where
        [:, i, j] is d y_i / d x_j. Entries with j > i are left at zero.
        """
        m, t = X.shape[0], X.shape[1]
        o_u, i_u = self.output_units, self.input_units

        ht_1 = self._initial_hidden_state(m, start_sequence)

        dY_dX = np.zeros((m, t, t, o_u, i_u))

        # Per-timestep local Jacobians, kept so later steps can chain back through them.
        grad_across_time = {}
        for i in range(t):
            xt = X[:,i,:]

            # The last return value is the new hidden state; it becomes ht_1 of step i+1.
            dyt_dxt, dyt_dht_1, dht_dht_1, dht_dxt, ht_1 = self.grad_input_t(xt, ht_1)
            dY_dX[:,i,i] = dyt_dxt

            grad_across_time[i] = {'dht_dht_1': dht_dht_1, 'dht_dxt': dht_dxt}

            # Walk backwards in time. At the top of each iteration dyt_dht_1 holds
            # d y_i / d h_j; it is then pushed one more step back through h_j.
            for j in range(i-1, -1, -1):
                dY_dX[:,i,j,:,:] = np.einsum('mij,mjk->mik', dyt_dht_1, grad_across_time[j]['dht_dxt'])
                dyt_dht_1 = np.einsum('mij,mjk->mik', dyt_dht_1, grad_across_time[j]['dht_dht_1'])
        return dY_dX

    def grad_parameters(self, X, start_sequence=None):
        """ grad of output wrt parameters for all timesteps -- not implemented yet

        The accumulation of the per-step results of grad_parameters_t across time
        has not been written, so this raises NotImplementedError explicitly rather
        than failing on undefined names.
        """
        raise NotImplementedError(
            "BasicRNN.grad_parameters is not implemented yet: per-timestep "
            "gradients from grad_parameters_t still have to be accumulated across time"
        )

    def backprop_grad(self, grad_loss, grad):
        """ chains grad_loss with the local gradients in `grad` (keys 'w', 'b', 'input')

        returns: dL_dwi, dL_dbi, grad_loss propagated to the input
        """
        # NOTE(review): the 'mij,mjk->mik' contraction expects a 3-D grad['input'],
        # while grad_input above returns a 5-D array -- confirm the intended shapes.
        dL_dwi = np.einsum('mij,mjkl->mikl', grad_loss, grad['w']).sum(axis=0)[0]
        dL_dbi = np.einsum('mij,mjk->mik', grad_loss, grad['b']).sum(axis=0)
        grad_loss = np.einsum('mij,mjk->mik', grad_loss, grad['input'])
        return dL_dwi, dL_dbi, grad_loss

    def gradient_dict(self, output):
        """ collects the input and parameter gradients for `output` into a dict

        Currently raises NotImplementedError through grad_parameters.
        """
        g = {}
        g['input'] = self.grad_input(output)
        g['w'], g['b'] = self.grad_parameters(output)
        return g

    def update(self, grad, optimizer):
        """ grad: (dL_dwi, dL_dbi)"""
        # NOTE(review): __init__ never sets self.dot (only hidden_layer / output_layer
        # are created), so this raises AttributeError as written -- decide which
        # layer(s) the two-element grad is meant to update.
        self.dot.W = optimizer.minimize(self.dot.W, grad[0])
        self.dot.b = optimizer.minimize(self.dot.b, grad[1])
        



In [56]:
# Seeded generator so Restart & Run All always draws the same sample batch
# (42 is an arbitrary fixed seed).
X = np.random.default_rng(42).standard_normal((2, 5, 6)) # m,t,f

In [58]:
# Per-sample shape with the batch axis dropped: (timestep, features).
# This is the value handed to BasicRNN as input_size.
X.shape[1:]

(5, 6)

In [91]:
# RNN with 4 hidden units and 3 sigmoid output units; input_size is the
# per-sample (timestep, features) shape of X.
layer = BasicRNN(input_size=X.shape[1:], output_activation=Sigmoid(), output_units=3, hidden_units=4)

In [93]:
# Shape check: grad_input allocates (m, t, t, output_units, input_units),
# i.e. one output-by-input Jacobian per (output timestep, input timestep) pair.
layer.grad_input(X).shape

(2, 5, 5, 3, 6)

In [84]:
# Shape check of the forward pass: expected (m, timestep, output_units).
layer.eval(X).shape

(2, 5, 3)

In [85]:
layer.get_total_parameters()
# Hand count for f=6 input features, h_u=4 hidden units, o_u=3 output units:
# wh -> (h_u + f, h_u) -> (4+6, 4) -> (10, 4) -> 40
# bh -> (1, h_u) -> (1, 4) -> 4
# wo -> (h_u, o_u) -> (4, 3) -> 12
# bo -> (1, o_u) -> (1, 3) -> 3
# 40 + 4 + 12 + 3 = 59

59

In [86]:
# Only the two weight shapes are returned; the bias shapes are not part of the result.
layer.get_parameter_shape() # wh_shape, wo_shape

((10, 4), (4, 3))

In [14]:
def func(X):
    """Return a copy of X with every negative entry replaced by 0.

    The masking is applied to the copy only, so X itself is left unchanged.
    """
    clipped = X.copy()
    negative = clipped < 0
    clipped[negative] = 0
    return clipped

In [15]:
ans = func(X)
# Prints the input X rather than `ans` -- presumably a check of whether func
# mutates its argument (TODO confirm). NOTE(review): func works on a copy, so X
# should print unchanged; the clipped values in the saved output look stale.
print(X)

[[[0.         0.27948775 0.98570582 0.38136993 0.         0.        ]
  [0.40027991 0.01333856 0.3616865  0.27387359 1.565782   1.91191395]
  [1.53453606 0.         1.10038672 0.         0.         0.10829895]
  [0.         0.83602428 0.46200109 0.         0.51585997 0.52166674]
  [1.27018777 0.36442535 0.         0.         2.1994819  0.        ]]]


In [23]:
# Display the raw input, presumably for comparison with the clipped variants.
X

array([[[-1.44005063, -0.39597992,  0.8269271 , -0.46858854,
         -0.15871013,  1.2412698 ],
        [-0.34501907,  1.91476221,  0.14398673, -0.238387  ,
         -0.56056956,  0.70709831],
        [-0.03690487,  1.51497133,  0.99664874, -1.13692055,
         -0.54514179,  1.91404761],
        [-0.18410903,  1.11430618, -0.48743262,  1.61321692,
          0.85752629, -0.25828559],
        [-0.12655564, -0.48302907, -1.69446662,  1.13701279,
         -0.01144496, -2.26846502]]])

In [22]:
# Mask-multiply alternative for zeroing negatives. Gotcha: False * negative
# yields -0.0 rather than 0.0, as the displayed output shows.
(X>=0)*X

array([[[-0.        , -0.        ,  0.8269271 , -0.        ,
         -0.        ,  1.2412698 ],
        [-0.        ,  1.91476221,  0.14398673, -0.        ,
         -0.        ,  0.70709831],
        [-0.        ,  1.51497133,  0.99664874, -0.        ,
         -0.        ,  1.91404761],
        [-0.        ,  1.11430618, -0.        ,  1.61321692,
          0.85752629, -0.        ],
        [-0.        , -0.        , -0.        ,  1.13701279,
         -0.        , -0.        ]]])