In [None]:
'''
 * Copyright (c) 2018 Radhamadhab Dalai
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
'''

The LSTM cell has several gates and memory units to control the flow of information. Let's denote:

- $ x_t $ as the input vector at time step $ t $,
- $ h_{t-1} $ as the previous hidden state (output) at time step $ t-1 $,
- $ c_{t-1} $ as the previous cell state at time step $ t-1 $,
- $ h_t $ as the current hidden state (output) at time step $ t $,
- $ c_t $ as the current cell state at time step $ t $.

The LSTM cell consists of the following components:

1. Forget Gate:
   - The forget gate decides what information to discard from the cell state.
   - It takes $ x_t $ and $ h_{t-1} $ as inputs and produces a forget gate activation vector $ f_t $ using a sigmoid activation function.
   - Mathematically, the forget gate is defined as:
     $ f_t = \sigma(W_f \cdot [h_{t-1}, x_t] + b_f) $
     where $ W_f $ is the weight matrix and $ b_f $ is the bias vector for the forget gate.

2. Input Gate:
   - The input gate decides what new information to store in the cell state.
   - It takes $ x_t $ and $ h_{t-1} $ as inputs and produces an input gate activation vector $ i_t $ and a candidate cell state update vector $ \tilde{c}_t $ using sigmoid and tanh activation functions, respectively.
   - Mathematically, the input gate and candidate cell state update are defined as:
     $ i_t = \sigma(W_i \cdot [h_{t-1}, x_t] + b_i) $
     $ \tilde{c}_t = \tanh(W_c \cdot [h_{t-1}, x_t] + b_c) $

3. Update Cell State:
   - The update cell state computes the new cell state by combining the previous cell state $ c_{t-1} $ with the information selected by the forget gate and the information to be added by the input gate.
   - Mathematically, the new cell state $ c_t $ is computed as:
     $ c_t = f_t \cdot c_{t-1} + i_t \cdot \tilde{c}_t $

4. Output Gate:
   - The output gate decides what information to output from the cell state.
   - It takes $ x_t $ and $ h_{t-1} $ as inputs and produces an output gate activation vector $ o_t $ and the next hidden state $ h_t $ using sigmoid and tanh activation functions, respectively.
   - Mathematically, the output gate and the next hidden state are defined as:
     $ o_t = \sigma(W_o \cdot [h_{t-1}, x_t] + b_o) $
     $ h_t = o_t \cdot \tanh(c_t) $


In [1]:
'''
 * Copyright (c) 2018 Radhamadhab Dalai
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
'''

import numpy as np

class LSTMCell:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        # Initialize weights and biases
        self.W_f = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_f = np.zeros((hidden_size, 1))
        
        self.W_i = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_i = np.zeros((hidden_size, 1))
        
        self.W_c = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_c = np.zeros((hidden_size, 1))
        
        self.W_o = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_o = np.zeros((hidden_size, 1))
        
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def tanh(self, x):
        return np.tanh(x)
    
    def forward(self, x_t, h_prev, c_prev):
        # Concatenate input and previous hidden state
        concat_input = np.vstack((h_prev, x_t))
        
        # Forget gate
        f_t = self.sigmoid(np.dot(self.W_f, concat_input) + self.b_f)
        
        # Input gate
        i_t = self.sigmoid(np.dot(self.W_i, concat_input) + self.b_i)
        
        # Candidate cell state update
        tilde_c_t = self.tanh(np.dot(self.W_c, concat_input) + self.b_c)
        
        # Update cell state
        c_t = f_t * c_prev + i_t * tilde_c_t
        
        # Output gate
        o_t = self.sigmoid(np.dot(self.W_o, concat_input) + self.b_o)
        
        # Update hidden state
        h_t = o_t * self.tanh(c_t)
        
        return h_t, c_t


In [1]:
import numpy as np

class LSTMCell:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        # Initialize weights and biases
        self.W_f = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_f = np.zeros((hidden_size, 1))
        
        self.W_i = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_i = np.zeros((hidden_size, 1))
        
        self.W_c = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_c = np.zeros((hidden_size, 1))
        
        self.W_o = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_o = np.zeros((hidden_size, 1))
        
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def tanh(self, x):
        return np.tanh(x)
    
    def forward(self, x_t, h_prev, c_prev):
        # Concatenate input and previous hidden state
        concat_input = np.vstack((h_prev, x_t))
        
        # Forget gate
        f_t = self.sigmoid(np.dot(self.W_f, concat_input) + self.b_f)
        
        # Input gate
        i_t = self.sigmoid(np.dot(self.W_i, concat_input) + self.b_i)
        
        # Candidate cell state update
        tilde_c_t = self.tanh(np.dot(self.W_c, concat_input) + self.b_c)
        
        # Update cell state
        c_t = f_t * c_prev + i_t * tilde_c_t
        
        # Output gate
        o_t = self.sigmoid(np.dot(self.W_o, concat_input) + self.b_o)
        
        # Update hidden state
        h_t = o_t * self.tanh(c_t)
        
        return h_t, c_t, f_t, i_t, tilde_c_t, o_t

class BidirectionalLSTM:
    def __init__(self, input_size, hidden_size, output_size):
        self.hidden_size = hidden_size
        
        self.lstm_forward = LSTMCell(input_size, hidden_size)
        self.lstm_backward = LSTMCell(input_size, hidden_size)
        
        self.W_y = np.random.randn(output_size, 2 * hidden_size)
        self.b_y = np.zeros((output_size, 1))
    
    def forward(self, X):
        T = X.shape[1]
        
        h_forward = np.zeros((self.hidden_size, T))
        c_forward = np.zeros((self.hidden_size, T))
        f_forward = np.zeros((self.hidden_size, T))
        i_forward = np.zeros((self.hidden_size, T))
        tilde_c_forward = np.zeros((self.hidden_size, T))
        o_forward = np.zeros((self.hidden_size, T))
        
        h_backward = np.zeros((self.hidden_size, T))
        c_backward = np.zeros((self.hidden_size, T))
        f_backward = np.zeros((self.hidden_size, T))
        i_backward = np.zeros((self.hidden_size, T))
        tilde_c_backward = np.zeros((self.hidden_size, T))
        o_backward = np.zeros((self.hidden_size, T))
        
        h_t = np.zeros((self.hidden_size, 1))
        c_t = np.zeros((self.hidden_size, 1))
        
        # Forward LSTM
        for t in range(T):
            x_t = X[:, t].reshape(-1, 1)
            h_t, c_t, f_t, i_t, tilde_c_t, o_t = self.lstm_forward.forward(x_t, h_t, c_t)
            h_forward[:, t] = h_t.ravel()
            c_forward[:, t] = c_t.ravel()
            f_forward[:, t] = f_t.ravel()
            i_forward[:, t] = i_t.ravel()
            tilde_c_forward[:, t] = tilde_c_t.ravel()
            o_forward[:, t] = o_t.ravel()
        
        h_t = np.zeros((self.hidden_size, 1))
        c_t = np.zeros((self.hidden_size, 1))
        
        # Backward LSTM
        for t in reversed(range(T)):
            x_t = X[:, t].reshape(-1, 1)
            h_t, c_t, f_t, i_t, tilde_c_t, o_t = self.lstm_backward.forward(x_t, h_t, c_t)
            h_backward[:, t] = h_t.ravel()
            c_backward[:, t] = c_t.ravel()
            f_backward[:, t] = f_t.ravel()
            i_backward[:, t] = i_t.ravel()
            tilde_c_backward[:, t] = tilde_c_t.ravel()
            o_backward[:, t] = o_t.ravel()
        
        h = np.vstack((h_forward, h_backward))
        
        y = self.softmax(np.dot(self.W_y, h) + self.b_y)
        
        self.h_forward = h_forward
        self.c_forward = c_forward
        self.f_forward = f_forward
        self.i_forward = i_forward
        self.tilde_c_forward = tilde_c_forward
        self.o_forward = o_forward
        self.h_backward = h_backward
        self.c_backward = c_backward
        self.f_backward = f_backward
        self.i_backward = i_backward
        self.tilde_c_backward = tilde_c_backward
        self.o_backward = o_backward
        
        return y
    
    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=0, keepdims=True))
        return exp_x / np.sum(exp_x, axis=0, keepdims=True)
    
    def compute_loss(self, Y, Y_pred):
        return -np.sum(Y * np.log(Y_pred))
    
    def backward(self, X, Y, Y_pred, lr=0.001):
        T = X.shape[1]
        
        dW_y = np.zeros_like(self.W_y)
        db_y = np.zeros_like(self.b_y)
        
        dh_forward = np.zeros_like(self.h_forward)
        dh_backward = np.zeros_like(self.h_backward)
        
        dc_forward = np.zeros_like(self.c_forward)
        dc_backward = np.zeros_like(self.c_backward)
        
        for t in range(T):
            dy = Y_pred[:, t].reshape(-1, 1) - Y[:, t].reshape(-1, 1)
            dW_y += np.dot(dy, self.h_forward[:, t].reshape(1, -1))
            db_y += dy
        
        # Backpropagation through time for forward LSTM
        for t in reversed(range(T)):
            dy = Y_pred[:, t].reshape(-1, 1) - Y[:, t].reshape(-1, 1)
            do = dy * np.tanh(self.c_forward[:, t].reshape(-1, 1))
            dc = do * self.o_forward[:, t].reshape(-1, 1) * (1 - np.tanh(self.c_forward[:, t].reshape(-1, 1)) ** 2)
            dc += dc_forward[:, t].reshape(-1, 1)
            
            di = dc * self.tilde_c_forward[:, t].reshape(-1, 1)
            df = dc * self.c_forward[:, t-1].reshape(-1, 1) if t > 0 else np.zeros_like(dc)
            dtilde_c = dc * self.i_forward[:, t].reshape(-1, 1)
            
            di_input = di * self.i_forward[:, t].reshape(-1, 1) * (1 - self.i_forward[:, t].reshape(-1, 1))
            df_input = df * self.f_forward[:, t].reshape(-1, 1) * (1 - self.f_forward[:, t].reshape(-1, 1))
            do_input = do * self.o_forward[:, t].reshape(-1, 1) * (1 - self.o_forward[:, t].reshape(-1, 1))
            dtilde_c_input = dtilde_c * (1 - self.tilde_c_forward[:, t].reshape(-1, 1) ** 2)
            
            dW_i = np.dot(di_input, np.vstack((self.h_forward[:, t-1].reshape(-1, 1), X[:, t].reshape(-1, 1))).T)
            dW_f = np.dot(df_input, np.vstack((self.h_forward[:, t-1].reshape(-1, 1), X[:, t].reshape(-1, 1))).T)
            dW_o = np.dot(do_input, np.vstack((self.h_forward[:, t-1].reshape(-1, 1), X[:, t].reshape(-1, 1))).T)
            dW_c = np.dot(dtilde_c_input, np.vstack((self.h_forward[:, t-1].reshape(-1, 1), X[:, t].reshape(-1, 1))).T)
            
            dc_forward[:, t-1] = dc * self.f_forward[:, t].reshape(-1, 1) if t > 0 else np.zeros_like(dc)
            dh_forward[:, t-1] = (np.dot(self.lstm_forward.W_i.T, di_input) +
                                  np.dot(self.lstm_forward.W_f.T, df_input) +
                                  np.dot(self.lstm_forward.W_o.T, do_input) +
                                  np.dot(self.lstm_forward.W_c.T, dtilde_c_input))[:self.hidden_size]
        
        # Backpropagation through time for backward LSTM
        for t in range(T):
            dy = Y_pred[:, t].reshape(-1, 1) - Y[:, t].reshape(-1, 1)
            do = dy * np.tanh(self.c_backward[:, t].reshape(-1, 1))
            dc = do * self.o_backward[:, t].reshape(-1, 1) * (1 - np.tanh(self.c_backward[:, t].reshape(-1, 1)) ** 2)
            dc += dc_backward[:, t].reshape(-1, 1)
            
            di = dc * self.tilde_c_backward[:, t].reshape(-1, 1)
            df = dc * self.c_backward[:, t+1].reshape(-1, 1) if t < T-1 else np.zeros_like(dc)
            dtilde_c = dc * self.i_backward[:, t].reshape(-1, 1)
            
            di_input = di * self.i_backward[:, t].reshape(-1, 1) * (1 - self.i_backward[:, t].reshape(-1, 1))
            df_input = df * self.f_backward[:, t].reshape(-1, 1) * (1 - self.f_backward[:, t].reshape(-1, 1))
            do_input = do * self.o_backward[:, t].reshape(-1, 1) * (1 - self.o_backward[:, t].reshape(-1, 1))
            dtilde_c_input = dtilde_c * (1 - self.tilde_c_backward[:, t].reshape(-1, 1) ** 2)
            
            dW_i = np.dot(di_input, np.vstack((self.h_backward[:, t+1].reshape(-1, 1), X[:, t].reshape(-1, 1))).T)
            dW_f = np.dot(df_input, np.vstack((self.h_backward[:, t+1].reshape(-1, 1), X[:, t].reshape(-1, 1))).T)
            dW_o = np.dot(do_input, np.vstack((self.h_backward[:, t+1].reshape(-1, 1), X[:, t].reshape(-1, 1))).T)
            dW_c = np.dot(dtilde_c_input, np.vstack((self.h_backward[:, t+1].reshape(-1, 1), X[:, t].reshape(-1, 1))).T)
            
            dc_backward[:, t+1] = dc * self.f_backward[:, t].reshape(-1, 1) if t < T-1 else np.zeros_like(dc)
            dh_backward[:, t+1] = (np.dot(self.lstm_backward.W_i.T, di_input) +
                                   np.dot(self.lstm_backward.W_f.T, df_input) +
                                   np.dot(self.lstm_backward.W_o.T, do_input) +
                                   np.dot(self.lstm_backward.W_c.T, dtilde_c_input))[:self.hidden_size]
        
        self.W_y -= lr * dW_y
        self.b_y -= lr * db_y
    
    def train(self, X_train, Y_train, epochs=100, lr=0.001):
        for epoch in range(epochs):
            Y_pred = self.forward(X_train)
            loss = self.compute_loss(Y_train, Y_pred)
            self.backward(X_train, Y_train, Y_pred, lr)
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss}")

# Example usage:
input_size = 9  # Each image is 3x3, flattened to a vector of size 9
hidden_size = 10
output_size = 2  # Two classes: 0 and 1

# Create a BidirectionalLSTM instance
model = BidirectionalLSTM(input_size, hidden_size, output_size)

# Sample input data (2 samples of 3x3 images flattened to 9xT vectors, with T=1)
X_train = np.array([
    [1, 1, 1, 1, 0, 1, 1, 1, 1],  # Digit '0'
    [0, 1, 0, 0, 1, 0, 0, 1, 0]   # Digit '1'
]).T

# Sample target output (one-hot encoded)
Y_train = np.array([
    [1, 0],  # Label for '0'
    [0, 1]   # Label for '1'
]).T

# Train the model
model.train(X_train, Y_train, epochs=1000, lr=0.01)


ValueError: operands could not be broadcast together with shapes (2,20) (2,10) (2,20) 

In [2]:
import numpy as np

class LSTMCell:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        # Initialize weights and biases
        self.W_f = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_f = np.zeros((hidden_size, 1))
        
        self.W_i = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_i = np.zeros((hidden_size, 1))
        
        self.W_c = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_c = np.zeros((hidden_size, 1))
        
        self.W_o = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_o = np.zeros((hidden_size, 1))
        
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def tanh(self, x):
        return np.tanh(x)
    
    def forward(self, x_t, h_prev, c_prev):
        # Concatenate input and previous hidden state
        concat_input = np.vstack((h_prev, x_t))
        
        # Forget gate
        f_t = self.sigmoid(np.dot(self.W_f, concat_input) + self.b_f)
        
        # Input gate
        i_t = self.sigmoid(np.dot(self.W_i, concat_input) + self.b_i)
        
        # Candidate cell state update
        tilde_c_t = self.tanh(np.dot(self.W_c, concat_input) + self.b_c)
        
        # Update cell state
        c_t = f_t * c_prev + i_t * tilde_c_t
        
        # Output gate
        o_t = self.sigmoid(np.dot(self.W_o, concat_input) + self.b_o)
        
        # Update hidden state
        h_t = o_t * self.tanh(c_t)
        
        return h_t, c_t, f_t, i_t, tilde_c_t, o_t

class BidirectionalLSTM:
    def __init__(self, input_size, hidden_size, output_size):
        self.hidden_size = hidden_size
        
        self.lstm_forward = LSTMCell(input_size, hidden_size)
        self.lstm_backward = LSTMCell(input_size, hidden_size)
        
        self.W_y = np.random.randn(output_size, 2 * hidden_size)
        self.b_y = np.zeros((output_size, 1))
    
    def forward(self, X):
        T = X.shape[1]
        
        h_forward = np.zeros((self.hidden_size, T))
        c_forward = np.zeros((self.hidden_size, T))
        f_forward = np.zeros((self.hidden_size, T))
        i_forward = np.zeros((self.hidden_size, T))
        tilde_c_forward = np.zeros((self.hidden_size, T))
        o_forward = np.zeros((self.hidden_size, T))
        
        h_backward = np.zeros((self.hidden_size, T))
        c_backward = np.zeros((self.hidden_size, T))
        f_backward = np.zeros((self.hidden_size, T))
        i_backward = np.zeros((self.hidden_size, T))
        tilde_c_backward = np.zeros((self.hidden_size, T))
        o_backward = np.zeros((self.hidden_size, T))
        
        h_t = np.zeros((self.hidden_size, 1))
        c_t = np.zeros((self.hidden_size, 1))
        
        # Forward LSTM
        for t in range(T):
            x_t = X[:, t].reshape(-1, 1)
            h_t, c_t, f_t, i_t, tilde_c_t, o_t = self.lstm_forward.forward(x_t, h_t, c_t)
            h_forward[:, t] = h_t.ravel()
            c_forward[:, t] = c_t.ravel()
            f_forward[:, t] = f_t.ravel()
            i_forward[:, t] = i_t.ravel()
            tilde_c_forward[:, t] = tilde_c_t.ravel()
            o_forward[:, t] = o_t.ravel()
        
        h_t = np.zeros((self.hidden_size, 1))
        c_t = np.zeros((self.hidden_size, 1))
        
        # Backward LSTM
        for t in reversed(range(T)):
            x_t = X[:, t].reshape(-1, 1)
            h_t, c_t, f_t, i_t, tilde_c_t, o_t = self.lstm_backward.forward(x_t, h_t, c_t)
            h_backward[:, t] = h_t.ravel()
            c_backward[:, t] = c_t.ravel()
            f_backward[:, t] = f_t.ravel()
            i_backward[:, t] = i_t.ravel()
            tilde_c_backward[:, t] = tilde_c_t.ravel()
            o_backward[:, t] = o_t.ravel()
        
        h = np.vstack((h_forward, h_backward))
        
        y = self.softmax(np.dot(self.W_y, h) + self.b_y)
        
        self.h_forward = h_forward
        self.c_forward = c_forward
        self.f_forward = f_forward
        self.i_forward = i_forward
        self.tilde_c_forward = tilde_c_forward
        self.o_forward = o_forward
        self.h_backward = h_backward
        self.c_backward = c_backward
        self.f_backward = f_backward
        self.i_backward = i_backward
        self.tilde_c_backward = tilde_c_backward
        self.o_backward = o_backward
        
        return y
    
    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=0, keepdims=True))
        return exp_x / np.sum(exp_x, axis=0, keepdims=True)
    
    def compute_loss(self, Y, Y_pred):
        return -np.sum(Y * np.log(Y_pred))
    
    def backward(self, X, Y, Y_pred, lr=0.001):
        T = X.shape[1]
        
        dW_y = np.zeros_like(self.W_y)
        db_y = np.zeros_like(self.b_y)
        
        dh_forward = np.zeros((self.hidden_size, T))
        dh_backward = np.zeros((self.hidden_size, T))
        
        dc_forward = np.zeros((self.hidden_size, T))
        dc_backward = np.zeros((self.hidden_size, T))
        
        for t in range(T):
            dy = Y_pred[:, t].reshape(-1, 1) - Y[:, t].reshape(-1, 1)
            dW_y += np.dot(dy, np.vstack((self.h_forward[:, t], self.h_backward[:, t])).T)
            db_y += dy
        
        # Backpropagation through time for forward LSTM
        for t in reversed(range(T)):
            dy = Y_pred[:, t].reshape(-1, 1) - Y[:, t].reshape(-1, 1)
            do = dy * np.tanh(self.c_forward[:, t].reshape(-1, 1))
            dc = do * self.o_forward[:, t].reshape(-1, 1) * (1 - np.tanh(self.c_forward[:, t].reshape(-1, 1)) ** 2)
            dc += dc_forward[:, t].reshape(-1, 1)
            
            di = dc * self.tilde_c_forward[:, t].reshape(-1, 1)
            df = dc * self.c_forward[:, t-1].reshape(-1, 1) if t > 0 else np.zeros_like(dc)
            dtilde_c = dc * self.i_forward[:, t].reshape(-1, 1)
            
            di_input = di * self.i_forward[:, t].reshape(-1, 1) * (1 - self.i_forward[:, t].reshape(-1, 1))
            df_input = df * self.f_forward[:, t].reshape(-1, 1) * (1 - self.f_forward[:, t].reshape(-1, 1))
            do_input = do * self.o_forward[:, t].reshape(-1, 1) * (1 - self.o_forward[:, t].reshape(-1, 1))
            dtilde_c_input = dtilde_c * (1 - self.tilde_c_forward[:, t].reshape(-1, 1) ** 2)
            
            concat_input = np.vstack((self.h_forward[:, t-1].reshape(-1, 1) if t > 0 else np.zeros((self.hidden_size, 1)), X[:, t].reshape(-1, 1)))
            
            dW_i = np.dot(di_input, concat_input.T)
            dW_f = np.dot(df_input, concat_input.T)
            dW_o = np.dot(do_input, concat_input.T)
            dW_c = np.dot(dtilde_c_input, concat_input.T)
            
            dc_forward[:, t-1] = dc * self.f_forward[:, t].reshape(-1, 1) if t > 0 else np.zeros_like(dc)
            dh_forward[:, t-1] = (np.dot(self.lstm_forward.W_i.T, di_input) +
                                  np.dot(self.lstm_forward.W_f.T, df_input) +
                                  np.dot(self.lstm_forward.W_o.T, do_input) +
                                  np.dot(self.lstm_forward.W_c.T, dtilde_c_input))[:self.hidden_size]
        
        # Backpropagation through time for backward LSTM
        for t in range(T):
            dy = Y_pred[:, t].reshape(-1, 1) - Y[:, t].reshape(-1, 1)
            do = dy * np.tanh(self.c_backward[:, t].reshape(-1, 1))
            dc = do * self.o_backward[:, t].reshape(-1, 1) * (1 - np.tanh(self.c_backward[:, t].reshape(-1, 1)) ** 2)
            dc += dc_backward[:, t].reshape(-1, 1)
            
            di = dc * self.tilde_c_backward[:, t].reshape(-1, 1)
            df = dc * self.c_backward[:, t+1].reshape(-1, 1) if t < T-1 else np.zeros_like(dc)
            dtilde_c = dc * self.i_backward[:, t].reshape(-1, 1)
            
            di_input = di * self.i_backward[:, t].reshape(-1, 1) * (1 - self.i_backward[:, t].reshape(-1, 1))
            df_input = df * self.f_backward[:, t].reshape(-1, 1) * (1 - self.f_backward[:, t].reshape(-1, 1))
            do_input = do * self.o_backward[:, t].reshape(-1, 1) * (1 - self.o_backward[:, t].reshape(-1, 1))
            dtilde_c_input = dtilde_c * (1 - self.tilde_c_backward[:, t].reshape(-1, 1) ** 2)
            
            concat_input = np.vstack((self.h_backward[:, t+1].reshape(-1, 1) if t < T-1 else np.zeros((self.hidden_size, 1)), X[:, t].reshape(-1, 1)))
            
            dW_i = np.dot(di_input, concat_input.T)
            dW_f = np.dot(df_input, concat_input.T)
            dW_o = np.dot(do_input, concat_input.T)
            dW_c = np.dot(dtilde_c_input, concat_input.T)
            
            dc_backward[:, t+1] = dc * self.f_backward[:, t].reshape(-1, 1) if t < T-1 else np.zeros_like(dc)
            dh_backward[:, t+1] = (np.dot(self.lstm_backward.W_i.T, di_input) +
                                   np.dot(self.lstm_backward.W_f.T, df_input) +
                                   np.dot(self.lstm_backward.W_o.T, do_input) +
                                   np.dot(self.lstm_backward.W_c.T, dtilde_c_input))[:self.hidden_size]
        
        self.W_y -= lr * dW_y
        self.b_y -= lr * db_y
    
    def train(self, X_train, Y_train, epochs=100, lr=0.001):
        for epoch in range(epochs):
            Y_pred = self.forward(X_train)
            loss = self.compute_loss(Y_train, Y_pred)
            self.backward(X_train, Y_train, Y_pred, lr)
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss}")

# Example usage:
input_size = 9  # Each image is 3x3, flattened to a vector of size 9
hidden_size = 10
output_size = 2  # Two classes: 0 and 1

# Create a BidirectionalLSTM instance
model = BidirectionalLSTM(input_size, hidden_size, output_size)

# Sample input data (2 samples of 3x3 images flattened to 9xT vectors, with T=1)
X_train = np.array([
    [1, 1, 1, 1, 0, 1, 1, 1, 1],  # Digit '0'
    [0, 1, 0, 0, 1, 0, 0, 1, 0]   # Digit '1'
]).T

# Sample target output (one-hot encoded)
Y_train = np.array([
    [1, 0],  # Label for '0'
    [0, 1]   # Label for '1'
]).T

# Train the model
model.train(X_train, Y_train, epochs=1000, lr=0.01)


ValueError: shapes (2,1) and (10,2) not aligned: 1 (dim 1) != 10 (dim 0)

In [6]:
import numpy as np

class LSTMCell:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        # Initialize weights and biases
        self.W_f = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_f = np.zeros((hidden_size, 1))
        
        self.W_i = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_i = np.zeros((hidden_size, 1))
        
        self.W_c = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_c = np.zeros((hidden_size, 1))
        
        self.W_o = np.random.randn(hidden_size, input_size + hidden_size)
        self.b_o = np.zeros((hidden_size, 1))
        
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def tanh(self, x):
        return np.tanh(x)
    
    def forward(self, x_t, h_prev, c_prev):
        # Concatenate input and previous hidden state
        concat_input = np.vstack((h_prev, x_t))
        
        # Forget gate
        f_t = self.sigmoid(np.dot(self.W_f, concat_input) + self.b_f)
        
        # Input gate
        i_t = self.sigmoid(np.dot(self.W_i, concat_input) + self.b_i)
        
        # Candidate cell state update
        tilde_c_t = self.tanh(np.dot(self.W_c, concat_input) + self.b_c)
        
        # Update cell state
        c_t = f_t * c_prev + i_t * tilde_c_t
        
        # Output gate
        o_t = self.sigmoid(np.dot(self.W_o, concat_input) + self.b_o)
        
        # Update hidden state
        h_t = o_t * self.tanh(c_t)
        
        return h_t, c_t, f_t, i_t, tilde_c_t, o_t

class BidirectionalLSTM:
    def __init__(self, input_size, hidden_size, output_size):
        self.hidden_size = hidden_size
        
        self.lstm_forward = LSTMCell(input_size, hidden_size)
        self.lstm_backward = LSTMCell(input_size, hidden_size)
        
        self.W_y = np.random.randn(output_size, 2 * hidden_size)
        self.b_y = np.zeros((output_size, 1))
    
    def forward(self, X):
        T = X.shape[1]
        
        h_forward = np.zeros((self.hidden_size, T))
        c_forward = np.zeros((self.hidden_size, T))
        f_forward = np.zeros((self.hidden_size, T))
        i_forward = np.zeros((self.hidden_size, T))
        tilde_c_forward = np.zeros((self.hidden_size, T))
        o_forward = np.zeros((self.hidden_size, T))
        
        h_backward = np.zeros((self.hidden_size, T))
        c_backward = np.zeros((self.hidden_size, T))
        f_backward = np.zeros((self.hidden_size, T))
        i_backward = np.zeros((self.hidden_size, T))
        tilde_c_backward = np.zeros((self.hidden_size, T))
        o_backward = np.zeros((self.hidden_size, T))
        
        h_t = np.zeros((self.hidden_size, 1))
        c_t = np.zeros((self.hidden_size, 1))
        
        # Forward LSTM
        for t in range(T):
            x_t = X[:, t].reshape(-1, 1)
            h_t, c_t, f_t, i_t, tilde_c_t, o_t = self.lstm_forward.forward(x_t, h_t, c_t)
            h_forward[:, t] = h_t.ravel()
            c_forward[:, t] = c_t.ravel()
            f_forward[:, t] = f_t.ravel()
            i_forward[:, t] = i_t.ravel()
            tilde_c_forward[:, t] = tilde_c_t.ravel()
            o_forward[:, t] = o_t.ravel()
        
        h_t = np.zeros((self.hidden_size, 1))
        c_t = np.zeros((self.hidden_size, 1))
        
        # Backward LSTM
        for t in reversed(range(T)):
            x_t = X[:, t].reshape(-1, 1)
            h_t, c_t, f_t, i_t, tilde_c_t, o_t = self.lstm_backward.forward(x_t, h_t, c_t)
            h_backward[:, t] = h_t.ravel()
            c_backward[:, t] = c_t.ravel()
            f_backward[:, t] = f_t.ravel()
            i_backward[:, t] = i_t.ravel()
            tilde_c_backward[:, t] = tilde_c_t.ravel()
            o_backward[:, t] = o_t.ravel()
        
        h = np.vstack((h_forward, h_backward))
        
        y = self.softmax(np.dot(self.W_y, h) + self.b_y)
        
        self.h_forward = h_forward
        self.c_forward = c_forward
        self.f_forward = f_forward
        self.i_forward = i_forward
        self.tilde_c_forward = tilde_c_forward
        self.o_forward = o_forward
        self.h_backward = h_backward
        self.c_backward = c_backward
        self.f_backward = f_backward
        self.i_backward = i_backward
        self.tilde_c_backward = tilde_c_backward
        self.o_backward = o_backward
        
        return y
    
    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=0, keepdims=True))
        return exp_x / np.sum(exp_x, axis=0, keepdims=True)
    
    def compute_loss(self, Y, Y_pred):
        return -np.sum(Y * np.log(Y_pred))
    
    def backward(self, X, Y, Y_pred, lr=0.001):
        T = X.shape[1]
        
        dW_y = np.zeros_like(self.W_y)
        db_y = np.zeros_like(self.b_y)
        
        dh_forward = np.zeros((self.hidden_size, T))
        dh_backward = np.zeros((self.hidden_size, T))
        
        dc_forward = np.zeros((self.hidden_size, T))
        dc_backward = np.zeros((self.hidden_size, T))
       
        for t in range(T):
            #print((self.h_forward[:, t]).shape)
            #print((self.h_backward[:, t]).shape)
           
            dy = (Y_pred[:, t].reshape(-1, 1) - Y[:, t].reshape(-1, 1))
            #print(dW_y.shape) # (2,20)
            #print(np.vstack((self.h_forward[:, t], self.h_backward[:, t])).T)
            #dW_y += np.dot(dy.T, np.vstack((self.h_forward[:, t], self.h_backward[:, t])))
            
           # Concatenate forward and backward hidden states along the columns
        
            h_concat = np.hstack((self.h_forward[:, t], self.h_backward[:, t]))
# Transpose h_concat to align dimensions properly
            h_concat_transposed = h_concat.T.reshape(1, -1)  # Shape: (1, 20)
# Now shape is (20, 2)

# Perform dot product with dy
            dW_y += np.dot(dy, h_concat_transposed)  
            
            db_y += dy
        
        # Backpropagation through time for forward LSTM
        for t in reversed(range(T)):
            dy = (Y_pred[:, t].reshape(-1, 1) - Y[:, t].reshape(-1, 1)) # (2,1) and (10,1)
            # Reshape dy to match the shape of self.c_forward[:, t]
            # dy_broadcasted = np.broadcast_to(dy, self.c_forward[:, t].shape)  # Assuming self.c_forward[:, t].shape is (10, 1)

# Perform element-wise multiplication
            # do = dy_broadcasted * np.tanh(self.c_forward[:, t].reshape(-1, 1).T)
            
            do = dy * np.tanh(self.c_forward[:, t].reshape(-1, 1))
            
            dc = do * self.o_forward[:, t].reshape(-1, 1) * (1 - np.tanh(self.c_forward[:, t].reshape(-1, 1)) ** 2)
            dc += dc_forward[:, t].reshape(-1, 1)
            
            di = dc * self.tilde_c_forward[:, t].reshape(-1, 1)
            df = dc * self.c_forward[:, t-1].reshape(-1, 1) if t > 0 else np.zeros_like(dc)
            dtilde_c = dc * self.i_forward[:, t].reshape(-1, 1)
            
            di_input = di * self.i_forward[:, t].reshape(-1, 1) * (1 - self.i_forward[:, t].reshape(-1, 1))
            df_input = df * self.f_forward[:, t].reshape(-1, 1) * (1 - self.f_forward[:, t].reshape(-1, 1))
            do_input = do * self.o_forward[:, t].reshape(-1, 1) * (1 - self.o_forward[:, t].reshape(-1, 1))
            dtilde_c_input = dtilde_c * (1 - self.tilde_c_forward[:, t].reshape(-1, 1) ** 2)
            
            concat_input = np.vstack((self.h_forward[:, t-1].reshape(-1, 1) if t > 0 else np.zeros((self.hidden_size, 1)), X[:, t].reshape(-1, 1)))
            
            self.lstm_forward.W_i -= lr * np.dot(di_input, concat_input.T)
            self.lstm_forward.W_f -= lr * np.dot(df_input, concat_input.T)
            self.lstm_forward.W_o -= lr * np.dot(do_input, concat_input.T)
            self.lstm_forward.W_c -= lr * np.dot(dtilde_c_input, concat_input.T)
            
            dc_forward[:, t-1] = (dc * self.f_forward[:, t].reshape(-1, 1)).reshape(-1) if t > 0 else np.zeros_like(dc)
            dh_forward[:, t-1] = (np.dot(self.lstm_forward.W_i.T, di_input) +
                                  np.dot(self.lstm_forward.W_f.T, df_input) +
                                  np.dot(self.lstm_forward.W_o.T, do_input) +
                                  np.dot(self.lstm_forward.W_c.T, dtilde_c_input))[:self.hidden_size].reshape(-1) if t > 0 else np.zeros_like(dc).reshape(-1)
        
        # Backpropagation through time for backward LSTM
        for t in range(T):
#             dy = (Y_pred[:, t].reshape(-1, 1) - Y[:, t].reshape(-1, 1))
#             do = dy * np.tanh(self.c_backward[:, t].reshape(-1, 1))
#             dc = do * self.o_backward[:, t].reshape(-1, 1) * (1 - np.tanh(self.c_backward[:, t].reshape(-1, 1)) ** 2)
#             dc += dc_backward[:, t].reshape(-1, 1)
            
#             di = dc * self.tilde_c_backward[:, t].reshape(-1, 1)
#             df = dc * self.c_backward[:, t+1].reshape(-1, 1) if t < T-1 else np.zeros_like(dc)
#             dtilde_c = dc * self.i_backward[:, t].reshape(-1, 1)
            
#             di_input = di * self.i_backward[:, t].reshape(-1, 1) * (1 - self.i_backward[:, t].reshape(-1, 1))
#             df_input = df * self.f_backward[:, t].reshape(-1, 1) * (1 - self.f_backward[:, t].reshape(-1, 1))
#             do_input = do * self.o_backward[:, t].reshape(-1, 1) * (1 - self.o_backward[:, t].reshape(-1, 1))
#             dtilde_c_input = dtilde_c * (1 - self.tilde_c_backward[:, t].reshape(-1, 1) ** 2)
            
#             concat_input = np.vstack((self.h_backward[:, t+1].reshape(-1, 1) if t < T-1 else np.zeros((self.hidden_size, 1)), X[:, t].reshape(-1, 1)))
            
#             self.lstm_backward.W_i -= lr * np.dot(di_input, concat_input.T)
#             self.lstm_backward.W_f -= lr * np.dot(df_input, concat_input.T)
#             self.lstm_backward.W_o -= lr * np.dot(do_input, concat_input.T)
#             self.lstm_backward.W_c -= lr * np.dot(dtilde_c_input, concat_input.T)
            
#             dc_backward[:, t+1] = (dc * self.f_backward[:, t].reshape(-1, 1)).reshape(-1) if t < T-1 else np.zeros_like(dc)
#             dh_backward[:, t+1] = (np.dot(self.lstm_backward.W_i.T, di_input) +
#                                    np.dot(self.lstm_backward.W_f.T, df_input) +
#                                    np.dot(self.lstm_backward.W_o.T, do_input) +
#                                    np.dot(self.lstm_backward.W_c.T, dtilde_c_input))[:self.hidden_size].reshape(-1) if t < T-1 else np.zeros_like(dc).reshape(-1)
        
#         self.W_y -= lr * dW_y
#         self.b_y -= lr * db_y
           # Backprop through time for forward direction
            dh_forward = np.dot(self.W_hy[:, :self.hidden_size].T, dy[:, t].reshape(-1, 1)) + dh_forward
            dtanh_forward = tanh_derivative(self.h_forward[:, t].reshape(-1, 1))
            dW_xh += np.dot(dh_forward * dtanh_forward, X[:, t].reshape(1, -1))
            db_h += dh_forward * dtanh_forward
            if t > 0:
                dW_hh += np.dot(dh_forward * dtanh_forward, self.h_forward[:, t-1].reshape(1, -1))
            dh_forward = np.dot(self.W_hh.T, dh_forward * dtanh_forward)

            # Backprop through time for backward direction
            dh_backward = np.dot(self.W_hy[:, self.hidden_size:].T, dy[:, T-t-1].reshape(-1, 1)) + dh_backward
            dtanh_backward = tanh_derivative(self.h_backward[:, T-t-1].reshape(-1, 1))
            dW_xh += np.dot(dh_backward * dtanh_backward, X[:, T-t-1].reshape(1, -1))
            db_h += dh_backward * dtanh_backward
            if T-t-1 < T-1:
                dW_hh += np.dot(dh_backward * dtanh_backward, self.h_backward[:, T-t].reshape(1, -1))
            dh_backward = np.dot(self.W_hh.T, dh_backward * dtanh_backward)

            # Update output layer weights
            h_concat_transposed = np.vstack((self.h_forward[:, t], self.h_backward[:, t])).reshape(1, -1)
            dW_hy += np.dot(dy[:, t].reshape(-1, 1), h_concat_transposed)

        # Update parameters
        self.W_xh -= lr * dW_xh
        self.W_hh -= lr * dW_hh
        self.W_hy -= lr * dW_hy
        self.b_h -= lr * db_h
        self.b_y -= lr * db_y
def backward(self, X, y_true, y_pred, lr):
    T = X.shape[1]
    dy = y_pred - y_true  # Assuming y_true is the target output

    dW_xh = np.zeros_like(self.W_xh)
            dW_hh = np.zeros_like(self.W_hh)
            dW_hy = np.zeros_like(self.W_hy)
            db_h = np.zeros_like(self.b_h)
            db_y = np.zeros_like(self.b_y)

            dh_forward = np.zeros((self.hidden_size, 1))
            dh_backward = np.zeros((self.hidden_size, 1))

            for t in range(T):
                # Backprop through time for forward direction
                dh_forward = np.dot(self.W_hy[:, :self.hidden_size].T, dy[:, t].reshape(-1, 1)) + dh_forward
                dtanh_forward = tanh_derivative(self.h_forward[:, t].reshape(-1, 1))
                dW_xh += np.dot(dh_forward * dtanh_forward, X[:, t].reshape(1, -1))
                db_h += dh_forward * dtanh_forward
                if t > 0:
                    dW_hh += np.dot(dh_forward * dtanh_forward, self.h_forward[:, t-1].reshape(1, -1))
                dh_forward = np.dot(self.W_hh.T, dh_forward * dtanh_forward)

                # Backprop through time for backward direction
                dh_backward = np.dot(self.W_hy[:, self.hidden_size:].T, dy[:, T-t-1].reshape(-1, 1)) + dh_backward
                dtanh_backward = tanh_derivative(self.h_backward[:, T-t-1].reshape(-1, 1))
                dW_xh += np.dot(dh_backward * dtanh_backward, X[:, T-t-1].reshape(1, -1))
                db_h += dh_backward * dtanh_backward
                if T-t-1 < T-1:
                    dW_hh += np.dot(dh_backward * dtanh_backward, self.h_backward[:, T-t].reshape(1, -1))
                dh_backward = np.dot(self.W_hh.T, dh_backward * dtanh_backward)

                # Update output layer weights
                h_concat_transposed = np.vstack((self.h_forward[:, t], self.h_backward[:, t])).reshape(1, -1)
                dW_hy += np.dot(dy[:, t].reshape(-1, 1), h_concat_transposed)

            # Update parameters
            self.W_xh -= lr * dW_xh
            self.W_hh -= lr * dW_hh
            self.W_hy -= lr * dW_hy
            self.b_h -= lr * db_h
            self.b_y -= lr * db_y

def train(self, X_train, Y_train, epochs=100, lr=0.001):
    for epoch in range(epochs):
        Y_pred = self.forward(X_train)
        loss = self.compute_loss(Y_train, Y_pred)
        self.backward(X_train, Y_train, Y_pred, lr)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss}")

# Example usage:
input_size = 9  # Each image is 3x3, flattened to a vector of size 9
hidden_size = 10
output_size = 2  # Two classes: 0 and 1

# Create a BidirectionalLSTM instance
model = BidirectionalLSTM(input_size, hidden_size, output_size)

# Sample input data (2 samples of 3x3 images flattened to 9xT vectors, with T=1)
X_train = np.array([
    [1, 1, 1, 1, 0, 1, 1, 1, 1],  # Digit '0'
    [0, 1, 0, 0, 1, 0, 0, 1, 0]   # Digit '1'
]).T

# Sample target output (one-hot encoded)
Y_train = np.array([
    [1, 0],  # Label for '0'
    [0, 1]   # Label for '1'
]).T

# Train the model
# print(self.h_forward[:, t])

model.train(X_train, Y_train, epochs=1000, lr=0.01)


ValueError: operands could not be broadcast together with shapes (2,1) (10,1) 

In [1]:
import numpy as np

# Example matrices
A = np.random.randn(2, 1)  # Shape (2, 1)
B = np.random.randn(10, 2) # Shape (10, 2)

# Transpose A to align dimensions correctly
C = np.dot(A.T, B.T)  # Dot product of shape (1, 2) and (10, 2)

print("Shape of A:", A.shape)
print("Shape of B:", B.shape)
print("Shape of C:", C.shape)
print("Resulting matrix C:\n", C)


Shape of A: (2, 1)
Shape of B: (10, 2)
Shape of C: (1, 10)
Resulting matrix C:
 [[ 0.06273332  0.47446912  3.60165141 -0.45661903 -1.55865188 -1.02723517
   0.88714056  1.90474578  1.27699197 -1.26093308]]


In [5]:
import numpy as np

# Activation functions
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1 - np.tanh(x) ** 2

# Bidirectional RNN with LSTM
class BidirectionalRNN:
    def __init__(self, input_size, hidden_size, output_size, sequence_length):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.sequence_length = sequence_length

        # Initialize weights and biases
        self.W_f = np.random.randn(hidden_size, input_size)
        self.W_i = np.random.randn(hidden_size, input_size)
        self.W_c = np.random.randn(hidden_size, input_size)
        self.W_o = np.random.randn(hidden_size, input_size)
        
        self.U_f = np.random.randn(hidden_size, hidden_size)
        self.U_i = np.random.randn(hidden_size, hidden_size)
        self.U_c = np.random.randn(hidden_size, hidden_size)
        self.U_o = np.random.randn(hidden_size, hidden_size)
        
        self.b_f = np.zeros((hidden_size, 1))
        self.b_i = np.zeros((hidden_size, 1))
        self.b_c = np.zeros((hidden_size, 1))
        self.b_o = np.zeros((hidden_size, 1))
        
        self.V_y = np.random.randn(output_size, 2 * hidden_size)
        self.b_y = np.zeros((output_size, 1))

    def forward(self, X):
        h_forward = np.zeros((self.hidden_size, self.sequence_length))
        c_forward = np.zeros((self.hidden_size, self.sequence_length))
        
        h_backward = np.zeros((self.hidden_size, self.sequence_length))
        c_backward = np.zeros((self.hidden_size, self.sequence_length))
        
        for t in range(self.sequence_length):
            x_t = X[:, t].reshape(-1, 1)
            h_prev_forward = h_forward[:, t-1].reshape(-1, 1) if t > 0 else np.zeros((self.hidden_size, 1))

            f_t = sigmoid(np.dot(self.W_f, x_t) + np.dot(self.U_f, h_prev_forward) + self.b_f)
            i_t = sigmoid(np.dot(self.W_i, x_t) + np.dot(self.U_i, h_prev_forward) + self.b_i)
            o_t = sigmoid(np.dot(self.W_o, x_t) + np.dot(self.U_o, h_prev_forward) + self.b_o)
            c_hat_t = tanh(np.dot(self.W_c, x_t) + np.dot(self.U_c, h_prev_forward) + self.b_c)
            t1 = c_forward[:,t].reshape(-1)
            print(t1.shape)
            #print((f_t * c_forward[:, t-1].reshape(-1,1))
            t2 = f_t * c_forward[:, t-1].reshape(-1,1)
            print(t2.shape)      
            t1[:] = np.squeeze(t2)
            print(t1.shape)
            c_forward[:, t] = t1
            #c_forward[:, t] = (f_t * c_forward[:, t-1].reshape(-1) if t > 0 else f_t) + i_t * c_hat_t
            #h_forward[:, :t] = o_t * tanh(c_forward[:, t])

        for t in range(self.sequence_length - 1, -1, -1):
            x_t = X[:, t].reshape(-1, 1)
            h_prev_backward = h_backward[:, t+1].reshape(-1, 1) if t < self.sequence_length - 1 else np.zeros((self.hidden_size, 1))

            f_t = sigmoid(np.dot(self.W_f, x_t) + np.dot(self.U_f, h_prev_backward) + self.b_f)
            i_t = sigmoid(np.dot(self.W_i, x_t) + np.dot(self.U_i, h_prev_backward) + self.b_i)
            o_t = sigmoid(np.dot(self.W_o, x_t) + np.dot(self.U_o, h_prev_backward) + self.b_o)
            c_hat_t = tanh(np.dot(self.W_c, x_t) + np.dot(self.U_c, h_prev_backward) + self.b_c)
            print(c_backward[:, t].shape)
            t_t = (f_t * c_backward[:, t+1].reshape(-1, 1) if t < self.sequence_length - 1 else f_t) + i_t * c_hat_t
            c_backward[:, t] = t_t.flatten()
            t2 = o_t * tanh(c_backward[:, t])
            t2_reshaped = t2.reshape(-1, 1)[:10].flatten()  # Ensure it becomes (10,)
            h_backward[:, t] = t2_reshaped
            #h_backward[:, t] = o_t * tanh(c_backward[:, t])
 

        h_concat = np.concatenate((h_forward[:, -1], h_backward[:, 0])).reshape(-1, 1)
        y_pred = sigmoid(np.dot(self.V_y, h_concat) + self.b_y)
        return y_pred, h_forward, h_backward, c_forward, c_backward

    def backward(self, X, y, y_pred, h_forward, h_backward, c_forward, c_backward, learning_rate):
        dV_y = np.zeros_like(self.V_y)
        db_y = np.zeros_like(self.b_y)
        
        dW_f = np.zeros_like(self.W_f)
        dW_i = np.zeros_like(self.W_i)
        dW_c = np.zeros_like(self.W_c)
        dW_o = np.zeros_like(self.W_o)
        
        dU_f = np.zeros_like(self.U_f)
        dU_i = np.zeros_like(self.U_i)
        dU_c = np.zeros_like(self.U_c)
        dU_o = np.zeros_like(self.U_o)
        
        db_f = np.zeros_like(self.b_f)
        db_i = np.zeros_like(self.b_i)
        db_c = np.zeros_like(self.b_c)
        db_o = np.zeros_like(self.b_o)

        dy = y_pred - y.reshape(-1, 1)
        dV_y += np.dot(dy, np.concatenate((h_forward[:, -1], h_backward[:, 0])).reshape(1, -1))
        db_y += dy

        dh_forward = np.dot(self.V_y[:, :self.hidden_size].T, dy)
        dh_backward = np.dot(self.V_y[:, self.hidden_size:].T, dy)

        for t in range(self.sequence_length - 1, -1, -1):
            do_forward = dh_forward * tanh(c_forward[:, t].reshape(-1, 1))
            dc_forward = dh_forward * (1 - tanh(c_forward[:, t].reshape(-1, 1)) ** 2)
            di_forward = dc_forward * tanh(np.dot(self.W_c, X[:, t].reshape(-1, 1)) + np.dot(self.U_c, h_forward[:, t-1].reshape(-1, 1)) + self.b_c)
            df_forward = dc_forward * c_forward[:, t-1].reshape(-1, 1)
            
            dW_o += np.dot(do_forward, X[:, t].reshape(1, -1))
            dU_o += np.dot(do_forward, h_forward[:, t-1].reshape(1, -1))
            db_o += do_forward

            dW_i += np.dot(di_forward, X[:, t].reshape(1, -1))
            dU_i += np.dot(di_forward, h_forward[:, t-1].reshape(1, -1))
            db_i += di_forward

            dW_f += np.dot(df_forward, X[:, t].reshape(1, -1))
            dU_f += np.dot(df_forward, h_forward[:, t-1].reshape(1, -1))
            db_f += df_forward

        for t in range(self.sequence_length):
            do_backward = dh_backward * tanh(c_backward[:, t].reshape(-1, 1))
            dc_backward = dh_backward * (1 - tanh(c_backward[:, t].reshape(-1, 1)) ** 2)
            di_backward = dc_backward * tanh(np.dot(self.W_c, X[:, t].reshape(-1, 1)) + np.dot(self.U_c, h_backward[:, t+1].reshape(-1, 1)) + self.b_c)
            df_backward = dc_backward * c_backward[:, t+1].reshape(-1, 1)
            
            dW_o += np.dot(do_backward, X[:, t].reshape(1, -1))
            dU_o += np.dot(do_backward, h_backward[:, t+1].reshape(1, -1))
            db_o += do_backward

            dW_i += np.dot(di_backward, X[:, t].reshape(1, -1))
            dU_i += np.dot(di_backward, h_backward[:, t+1].reshape(1, -1))
            db_i += di_backward

            dW_f += np.dot(df_backward, X[:, t].reshape(1, -1))
            dU_f += np.dot(df_backward, h_backward[:, t+1].reshape(1, -1))
            db_f += df_backward

        self.V_y -= learning_rate * dV_y
        self.b_y -= learning_rate * db_y

        self.W_f -= learning_rate * dW_f
        self.W_i -= learning_rate * dW_i
        self.W_c -= learning_rate * dW_c
        self.W_o -= learning_rate * dW_o

        self.U_f -= learning_rate * dU_f
        self.U_i -= learning_rate * dU_i
        self.U_c -= learning_rate * dU_c
        self.U_o -= learning_rate * dU_o

        self.b_f -= learning_rate * db_f
        self.b_i -= learning_rate * db_i
        self.b_c -= learning_rate * db_c
        self.b_o -= learning_rate * db_o

    def train(self, X, y, epochs, learning_rate):
        for epoch in range(epochs):
            y_pred, h_forward, h_backward, c_forward, c_backward = self.forward(X)
            self.backward(X, y, y_pred, h_forward, h_backward, c_forward, c_backward, learning_rate)
            if epoch % 10 == 0:
                loss = np.mean((y - y_pred) ** 2)
                print(f'Epoch {epoch}, Loss: {loss}')

# Example usage
input_size = 5
hidden_size = 10
output_size = 1
sequence_length = 20
learning_rate = 0.001
epochs = 100

# Dummy data
X = np.random.randn(input_size, sequence_length)
y = np.random.randint(0, 2, (output_size, 1))

# Create and train the model
rnn = BidirectionalRNN(input_size, hidden_size, output_size, sequence_length)
rnn.train(X, y, epochs, learning_rate)


(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10, 1)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)
(10,)


IndexError: index 20 is out of bounds for axis 1 with size 20

In [4]:
import numpy as np

# Example arrays
t1 = np.random.randn(10)  # Shape (10,)
t2 = np.random.randn(10, 10)  # Shape (10, 10)

# Case 1: Assigning a row
t1[:] = t2[0, :]  # Assign the first row of t2 to t1
print('After assigning row:', t1)

# Case 2: Assigning a column
t1[:] = t2[:, 0]  # Assign the first column of t2 to t1
print('After assigning column:', t1)

# Case 3: Flatten and take first 10 elements
t1 = t2.flatten()[:10]  # Flatten t2 and take first 10 elements
print('After flattening and slicing:', t1)

# Case 4: Reshaping and assigning
t2_reshaped = t2.reshape(-1, 1)[:10].flatten()  # Ensure it becomes (10,)
t1[:] = t2_reshaped
print('After reshaping and assigning:', t1.shape)

After assigning row: [ 1.28727281 -1.18136797  1.02094269 -1.84678749  1.27751904 -0.02640973
  0.2429473   0.56807779 -1.78637454  0.06674648]
After assigning column: [ 1.28727281  0.71817259 -0.42793566  0.78615642  0.81391894  1.25202391
 -1.24249576  0.07097557  0.20145723  0.30219445]
After flattening and slicing: [ 1.28727281 -1.18136797  1.02094269 -1.84678749  1.27751904 -0.02640973
  0.2429473   0.56807779 -1.78637454  0.06674648]
After reshaping and assigning: (10,)


In [6]:
import numpy as np

# Example dimensions
n_hidden = 10  # Number of hidden units
n_input = 5    # Number of input features
batch_size = 20  # Batch size, assumed here as 20 for the example

# Randomly initialize matrices for demonstration purposes
W_c = np.random.randn(n_hidden, n_input)
U_c = np.random.randn(n_hidden, n_hidden)
b_c = np.random.randn(n_hidden, 1)
X = np.random.randn(n_input, batch_size)  # Input batch of size 20
h_backward = np.random.randn(n_hidden, batch_size + 1)  # Hidden states including initial state
dc_backward = np.random.randn(n_hidden, batch_size)  # Example dc_backward

# Example timestep t (ensure t is within bounds)
t = 0  # Starting index, should be between 0 and batch_size-1

# Ensure t is within valid range
if t < 0 or t >= batch_size:
    raise ValueError(f"Time step t should be in range [0, {batch_size-1}]")

# Ensure all shapes are compatible
x_t_reshaped = X[:, t].reshape(-1, 1)  # Shape (n_input, 1)
h_b_t1_reshaped = h_backward[:, t+1].reshape(-1, 1)  # Shape (n_hidden, 1)

# Calculate dot products
dot_Wc_X = np.dot(W_c, x_t_reshaped)  # Shape (n_hidden, 1)
dot_Uc_hb = np.dot(U_c, h_b_t1_reshaped)  # Shape (n_hidden, 1)

# Sum the components and add bias
sum_dot_products = dot_Wc_X + dot_Uc_hb + b_c  # Shape (n_hidden, 1)

# Apply tanh
tanh_sum = np.tanh(sum_dot_products)  # Shape (n_hidden, 1)

# Element-wise multiply with dc_backward
di_backward = dc_backward[:, t].reshape(-1, 1) * tanh_sum  # Shape (n_hidden, 1)

# Printing for verification
print("dc_backward shape:", dc_backward[:, t].reshape(-1, 1).shape)
print("dot_Wc_X shape:", dot_Wc_X.shape)
print("dot_Uc_hb shape:", dot_Uc_hb.shape)
print("b_c shape:", b_c.shape)
print("sum_dot_products shape:", sum_dot_products.shape)
print("tanh_sum shape:", tanh_sum.shape)
print("di_backward shape:", di_backward.shape)

dc_backward shape: (10, 1)
dot_Wc_X shape: (10, 1)
dot_Uc_hb shape: (10, 1)
b_c shape: (10, 1)
sum_dot_products shape: (10, 1)
tanh_sum shape: (10, 1)
di_backward shape: (10, 1)


In [7]:
import numpy as np

class BidirectionalRNN:
    def __init__(self, input_size, hidden_size, output_size, seq_length):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.seq_length = seq_length

        # Initialize weights
        self.W_xh_forward = np.random.randn(hidden_size, input_size)
        self.W_hh_forward = np.random.randn(hidden_size, hidden_size)
        self.b_h_forward = np.zeros((hidden_size, 1))

        self.W_xh_backward = np.random.randn(hidden_size, input_size)
        self.W_hh_backward = np.random.randn(hidden_size, hidden_size)
        self.b_h_backward = np.zeros((hidden_size, 1))

        self.W_hy = np.random.randn(output_size, 2 * hidden_size)
        self.b_y = np.zeros((output_size, 1))

    def forward(self, X):
        h_forward = np.zeros((self.hidden_size, self.seq_length))
        h_backward = np.zeros((self.hidden_size, self.seq_length))
        Y = np.zeros((self.output_size, self.seq_length))

        # Forward pass
        for t in range(self.seq_length):
            x_t = X[:, t].reshape(-1, 1)
            if t == 0:
                h_forward[:, t] = np.tanh(np.dot(self.W_xh_forward, x_t) + self.b_h_forward).flatten()
            else:
                h_forward[:, t] = np.tanh(np.dot(self.W_xh_forward, x_t) + np.dot(self.W_hh_forward, h_forward[:, t-1].reshape(-1, 1)) + self.b_h_forward).flatten()

        # Backward pass
        for t in reversed(range(self.seq_length)):
            x_t = X[:, t].reshape(-1, 1)
            if t == self.seq_length - 1:
                h_backward[:, t] = np.tanh(np.dot(self.W_xh_backward, x_t) + self.b_h_backward).flatten()
            else:
                h_backward[:, t] = np.tanh(np.dot(self.W_xh_backward, x_t) + np.dot(self.W_hh_backward, h_backward[:, t+1].reshape(-1, 1)) + self.b_h_backward).flatten()

        # Output layer
        for t in range(self.seq_length):
            h_concat = np.concatenate((h_forward[:, t], h_backward[:, t])).reshape(-1, 1)
            Y[:, t] = np.dot(self.W_hy, h_concat).flatten() + self.b_y.flatten()

        return Y

# Example usage
input_size = 10
hidden_size = 20
output_size = 5
seq_length = 15

X = np.random.randn(input_size, seq_length)

rnn = BidirectionalRNN(input_size, hidden_size, output_size, seq_length)
Y = rnn.forward(X)

print("Output Y:", Y)


Output Y: [[  1.30163509   7.07544671  -7.05588653   3.96047734   2.38241803
    5.17756802   1.56250443  -5.78920478  -4.41848448  -3.83722165
    3.43541901   6.49849044  20.19723144  -8.13955026   0.95239785]
 [ -2.69841934   2.29325906  -2.31885708   2.4509893   -5.4886059
   -9.08251322  -9.67782239   4.38749734   9.79895852   9.21639647
    5.704967    -6.16941026 -10.14077588  -0.18198193   6.23166498]
 [  1.13965882  -0.22977437  -1.67931229 -12.53216329  -7.33665196
   -5.82198187  -6.29288702   0.67500261  -0.89203674  10.32818604
  -11.68439997  -6.51663573  -3.45511747  -2.54123866   1.54084556]
 [ -6.9566914   -1.74641181  -7.60329237  -7.06665351   0.953177
    4.29658088  -1.33672038   6.47574353   6.30524103  -2.01889714
   -4.70632023  -3.92074232  -9.69582973  -1.554269    -0.40463853]
 [ -5.33752486   1.36616355   7.44670578   3.80909187  -4.37499531
   -7.14967499  -1.34761871  10.26028012  -0.15742397  13.67016264
    0.43516504  -1.50984267 -12.18702335  -2.346406

In [10]:
import numpy as np

class SimpleBidirectionalRNN:
    def __init__(self, input_size, hidden_size, output_size, seq_length, learning_rate=0.001):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.seq_length = seq_length
        self.learning_rate = learning_rate

        # Initialize weights
        self.W_xh_forward = np.random.randn(hidden_size, input_size)
        self.W_hh_forward = np.random.randn(hidden_size, hidden_size)
        self.b_h_forward = np.zeros((hidden_size, 1))

        self.W_xh_backward = np.random.randn(hidden_size, input_size)
        self.W_hh_backward = np.random.randn(hidden_size, hidden_size)
        self.b_h_backward = np.zeros((hidden_size, 1))

        self.W_hy = np.random.randn(output_size, 2 * hidden_size)
        self.b_y = np.zeros((output_size, 1))

    def forward(self, X):
        self.h_forward = np.zeros((self.hidden_size, self.seq_length))
        self.h_backward = np.zeros((self.hidden_size, self.seq_length))
        self.Y = np.zeros((self.output_size, self.seq_length))

        # Forward pass
        for t in range(self.seq_length):
            x_t = X[:, t].reshape(-1, 1)
            if t == 0:
                self.h_forward[:, t] = np.tanh(np.dot(self.W_xh_forward, x_t) + self.b_h_forward).flatten()
            else:
                self.h_forward[:, t] = np.tanh(np.dot(self.W_xh_forward, x_t) + np.dot(self.W_hh_forward, self.h_forward[:, t-1].reshape(-1, 1)) + self.b_h_forward).flatten()

        # Backward pass
        for t in reversed(range(self.seq_length)):
            x_t = X[:, t].reshape(-1, 1)
            if t == self.seq_length - 1:
                self.h_backward[:, t] = np.tanh(np.dot(self.W_xh_backward, x_t) + self.b_h_backward).flatten()
            else:
                self.h_backward[:, t] = np.tanh(np.dot(self.W_xh_backward, x_t) + np.dot(self.W_hh_backward, self.h_backward[:, t+1].reshape(-1, 1)) + self.b_h_backward).flatten()

        # Output layer
        for t in range(self.seq_length):
            h_concat = np.concatenate((self.h_forward[:, t], self.h_backward[:, t])).reshape(-1, 1)
            self.Y[:, t] = (np.dot(self.W_hy, h_concat) + self.b_y).flatten()

        return self.Y

    def backward(self, X, Y_true):
        dW_xh_forward = np.zeros_like(self.W_xh_forward)
        dW_hh_forward = np.zeros_like(self.W_hh_forward)
        db_h_forward = np.zeros_like(self.b_h_forward)

        dW_xh_backward = np.zeros_like(self.W_xh_backward)
        dW_hh_backward = np.zeros_like(self.W_hh_backward)
        db_h_backward = np.zeros_like(self.b_h_backward)

        dW_hy = np.zeros_like(self.W_hy)
        db_y = np.zeros_like(self.b_y)

        dY = self.Y - Y_true

        for t in range(self.seq_length):
            dy = dY[:, t].reshape(-1, 1)
            h_concat = np.concatenate((self.h_forward[:, t], self.h_backward[:, t])).reshape(-1, 1)

            dW_hy += np.dot(dy, h_concat.T)
            db_y += dy

            dh_forward = np.dot(self.W_hy[:, :self.hidden_size].T, dy)
            for k in range(t, -1, -1):
                dh_forward = dh_forward * (1 - self.h_forward[:, k].reshape(-1, 1)**2)
                dW_xh_forward += np.dot(dh_forward, X[:, k].reshape(1, -1))
                db_h_forward += dh_forward
                if k != 0:
                    dW_hh_forward += np.dot(dh_forward, self.h_forward[:, k-1].reshape(1, -1))
                    dh_forward = np.dot(self.W_hh_forward.T, dh_forward)

            dh_backward = np.dot(self.W_hy[:, self.hidden_size:].T, dy)
            for k in range(t, self.seq_length):
                dh_backward = dh_backward * (1 - self.h_backward[:, k].reshape(-1, 1)**2)
                dW_xh_backward += np.dot(dh_backward, X[:, k].reshape(1, -1))
                db_h_backward += dh_backward
                if k != self.seq_length - 1:
                    dW_hh_backward += np.dot(dh_backward, self.h_backward[:, k+1].reshape(1, -1))
                    dh_backward = np.dot(self.W_hh_backward.T, dh_backward)

        self.W_xh_forward -= self.learning_rate * dW_xh_forward
        self.W_hh_forward -= self.learning_rate * dW_hh_forward
        self.b_h_forward -= self.learning_rate * db_h_forward

        self.W_xh_backward -= self.learning_rate * dW_xh_backward
        self.W_hh_backward -= self.learning_rate * dW_hh_backward
        self.b_h_backward -= self.learning_rate * db_h_backward

        self.W_hy -= self.learning_rate * dW_hy
        self.b_y -= self.learning_rate * db_y

    def train(self, X_train, Y_train, epochs):
        for epoch in range(epochs):
            loss = 0
            for X, Y in zip(X_train, Y_train):
                Y_pred = self.forward(X)
                self.backward(X, Y)
                loss += np.sum((Y_pred - Y) ** 2)
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss / len(X_train)}')

# Example usage
input_size = 10
hidden_size = 20
output_size = 5
seq_length = 15
learning_rate = 0.001
epochs = 10

# Generate random training data
X_train = [np.random.randn(input_size, seq_length) for _ in range(100)]
Y_train = [np.random.randn(output_size, seq_length) for _ in range(100)]

rnn = SimpleBidirectionalRNN(input_size, hidden_size, output_size, seq_length, learning_rate)
rnn.train(X_train, Y_train, epochs)


Epoch 1/10, Loss: 771.6388182602528
Epoch 2/10, Loss: 222.70415716326218
Epoch 3/10, Loss: 116.27330771888519
Epoch 4/10, Loss: 90.4364523962641
Epoch 5/10, Loss: 81.5900988975763
Epoch 6/10, Loss: 78.54778902845628
Epoch 7/10, Loss: 76.81286575409018
Epoch 8/10, Loss: 76.2742182929642
Epoch 9/10, Loss: 75.83716311236157
Epoch 10/10, Loss: 75.715586036493


In [2]:
import numpy as np

class SimpleBiLSTM:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.001):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        # Initialize weights and biases for forward LSTM
        self.W_f = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_f = np.zeros((hidden_size, 1))

        self.W_i = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_i = np.zeros((hidden_size, 1))

        self.W_c = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_c = np.zeros((hidden_size, 1))

        self.W_o = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_o = np.zeros((hidden_size, 1))

        # Initialize weights and biases for backward LSTM
        self.W_f_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_f_back = np.zeros((hidden_size, 1))

        self.W_i_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_i_back = np.zeros((hidden_size, 1))

        self.W_c_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_c_back = np.zeros((hidden_size, 1))

        self.W_o_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_o_back = np.zeros((hidden_size, 1))

        # Initialize weights and biases for output layer
        self.W_y = np.random.randn(output_size, 2 * hidden_size)
        self.b_y = np.zeros((output_size, 1))

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def tanh(self, x):
        return np.tanh(x)

    def forward(self, X):
        T = X.shape[1]  # Length of the sequence

        # Initialize hidden and cell states for forward and backward passes
        self.h_forward = np.zeros((self.hidden_size, T))
        self.c_forward = np.zeros((self.hidden_size, T))
        self.h_backward = np.zeros((self.hidden_size, T))
        self.c_backward = np.zeros((self.hidden_size, T))

        # Forward pass
        for t in range(T):
            x_t = X[:, t].reshape(-1, 1)
            combined = np.vstack((self.h_forward[:, t-1].reshape(-1, 1), x_t)) if t > 0 else np.vstack((np.zeros((self.hidden_size, 1)), x_t))

            f_t = self.sigmoid(np.dot(self.W_f, combined) + self.b_f)
            i_t = self.sigmoid(np.dot(self.W_i, combined) + self.b_i)
            c_tilde = np.tanh(np.dot(self.W_c, combined) + self.b_c)
            self.c_forward[:, t] = f_t.flatten() * self.c_forward[:, t-1] + i_t.flatten() * c_tilde.flatten() if t > 0 else i_t.flatten() * c_tilde.flatten()
            o_t = self.sigmoid(np.dot(self.W_o, combined) + self.b_o)
            self.h_forward[:, t] = o_t.flatten() * np.tanh(self.c_forward[:, t])

        # Backward pass
        for t in reversed(range(T)):
            x_t = X[:, t].reshape(-1, 1)
            combined = np.vstack((self.h_backward[:, t+1].reshape(-1, 1), x_t)) if t < T-1 else np.vstack((np.zeros((self.hidden_size, 1)), x_t))

            f_t = self.sigmoid(np.dot(self.W_f_back, combined) + self.b_f_back)
            i_t = self.sigmoid(np.dot(self.W_i_back, combined) + self.b_i_back)
            c_tilde = np.tanh(np.dot(self.W_c_back, combined) + self.b_c_back)
            self.c_backward[:, t] = f_t.flatten() * self.c_backward[:, t+1] + i_t.flatten() * c_tilde.flatten() if t < T-1 else i_t.flatten() * c_tilde.flatten()
            o_t = self.sigmoid(np.dot(self.W_o_back, combined) + self.b_o_back)
            self.h_backward[:, t] = o_t.flatten() * np.tanh(self.c_backward[:, t])

        # Concatenate forward and backward hidden states
        self.h_concat = np.vstack((self.h_forward, self.h_backward))

        # Output layer
        y_pred = np.dot(self.W_y, self.h_concat) + self.b_y

        return y_pred

    def train(self, X_train, Y_train, epochs):
        for epoch in range(epochs):
            for X, Y_true in zip(X_train, Y_train):
                # Forward pass
                Y_pred = self.forward(X)

                # Backpropagation through time (BPTT) and gradient descent
                # Update weights using gradients (not shown for simplicity)
                pass

# Example usage:
input_size = 10
hidden_size = 20
output_size = 5
seq_length = 15
learning_rate = 0.001
epochs = 10

# Generate random training data
X_train = [np.random.randn(input_size, seq_length) for _ in range(100)]
Y_train = [np.random.randn(output_size, seq_length) for _ in range(100)]

# Create and train the BiLSTM model
bilstm = SimpleBiLSTM(input_size, hidden_size, output_size, learning_rate)
bilstm.train(X_train, Y_train, epochs)


In [5]:
import numpy as np

class SimpleBiLSTM:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.001):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        # Initialize weights and biases for forward LSTM
        self.W_f = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_f = np.zeros((hidden_size, 1))

        self.W_i = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_i = np.zeros((hidden_size, 1))

        self.W_c = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_c = np.zeros((hidden_size, 1))

        self.W_o = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_o = np.zeros((hidden_size, 1))

        # Initialize weights and biases for backward LSTM
        self.W_f_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_f_back = np.zeros((hidden_size, 1))

        self.W_i_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_i_back = np.zeros((hidden_size, 1))

        self.W_c_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_c_back = np.zeros((hidden_size, 1))

        self.W_o_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_o_back = np.zeros((hidden_size, 1))

        # Initialize weights and biases for output layer
        self.W_y = np.random.randn(output_size, 2 * hidden_size)
        self.b_y = np.zeros((output_size, 1))

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def tanh(self, x):
        return np.tanh(x)

    def forward(self, X):
        T = X.shape[1]  # Length of the sequence

        # Initialize hidden and cell states for forward and backward passes
        h_forward = np.zeros((self.hidden_size, T))
        c_forward = np.zeros((self.hidden_size, T))
        h_backward = np.zeros((self.hidden_size, T))
        c_backward = np.zeros((self.hidden_size, T))

        # Forward pass
        for t in range(T):
            x_t = X[:, t].reshape(-1, 1)
            combined = np.concatenate((h_forward[:, t-1:t], x_t), axis=0) if t > 0 else np.concatenate((np.zeros((self.hidden_size, 1)), x_t), axis=0)

            f_t = self.sigmoid(np.dot(self.W_f, combined) + self.b_f)
            i_t = self.sigmoid(np.dot(self.W_i, combined) + self.b_i)
            c_tilde = self.tanh(np.dot(self.W_c, combined) + self.b_c)
            c_forward[:, t:t+1] = f_t * c_forward[:, t-1:t] + i_t * c_tilde if t > 0 else i_t * c_tilde
            o_t = self.sigmoid(np.dot(self.W_o, combined) + self.b_o)
            h_forward[:, t:t+1] = o_t * self.tanh(c_forward[:, t:t+1])

        # Backward pass
        for t in reversed(range(T)):
            x_t = X[:, t].reshape(-1, 1)
            combined = np.concatenate((h_backward[:, t+1:t+2], x_t), axis=0) if t < T-1 else np.concatenate((np.zeros((self.hidden_size, 1)), x_t), axis=0)

            f_t = self.sigmoid(np.dot(self.W_f_back, combined) + self.b_f_back)
            i_t = self.sigmoid(np.dot(self.W_i_back, combined) + self.b_i_back)
            c_tilde = self.tanh(np.dot(self.W_c_back, combined) + self.b_c_back)
            c_backward[:, t:t+1] = f_t * c_backward[:, t+1:t+2] + i_t * c_tilde if t < T-1 else i_t * c_tilde
            o_t = self.sigmoid(np.dot(self.W_o_back, combined) + self.b_o_back)
            h_backward[:, t:t+1] = o_t * self.tanh(c_backward[:, t:t+1])

        # Concatenate forward and backward hidden states
        h_concat = np.concatenate((h_forward, h_backward), axis=0)

        # Output layer
        y_pred = np.dot(self.W_y, h_concat) + self.b_y

        return y_pred, h_forward, c_forward, h_backward, c_backward

    def backward(self, X, Y_true, Y_pred, h_forward, c_forward, h_backward, c_backward):
        T = X.shape[1]
        dW_y = np.zeros_like(self.W_y)
        db_y = np.zeros_like(self.b_y)

        dh_forward = np.zeros_like(h_forward)
        dc_forward = np.zeros_like(c_forward)

        dh_backward = np.zeros_like(h_backward)
        dc_backward = np.zeros_like(c_backward)

        dW_f = np.zeros_like(self.W_f)
        db_f = np.zeros_like(self.b_f)

        dW_i = np.zeros_like(self.W_i)
        db_i = np.zeros_like(self.b_i)

        dW_c = np.zeros_like(self.W_c)
        db_c = np.zeros_like(self.b_c)

        dW_o = np.zeros_like(self.W_o)
        db_o = np.zeros_like(self.b_o)

        dW_f_back = np.zeros_like(self.W_f_back)
        db_f_back = np.zeros_like(self.b_f_back)

        dW_i_back = np.zeros_like(self.W_i_back)
        db_i_back = np.zeros_like(self.b_i_back)

        dW_c_back = np.zeros_like(self.W_c_back)
        db_c_back = np.zeros_like(self.b_c_back)

        dW_o_back = np.zeros_like(self.W_o_back)
        db_o_back = np.zeros_like(self.b_o_back)

        dy = Y_pred - Y_true

        dW_y += np.dot(dy, np.concatenate((h_forward, h_backward), axis=0).T)
        db_y += np.sum(dy, axis=1, keepdims=True)

        for t in reversed(range(T)):
            dy_forward = np.dot(self.W_y[:, :self.hidden_size].T, dy)
            dy_backward = np.dot(self.W_y[:, self.hidden_size:].T, dy)

            dh_forward[:, t] += dy_forward.flatten()
            dh_backward[:, t] += dy_backward.flatten()

            do_forward = dh_forward[:, t] * np.tanh(c_forward[:, t])
            dc_forward[:, t] += dh_forward[:, t] * self.sigmoid(h_forward[:, t]) * (1 - np.tanh(c_forward[:, t]) ** 2)

            do_backward = dh_backward[:, t] * np.tanh(c_backward[:, t])
            dc_backward[:, t] += dh_backward[:, t] * self.sigmoid(h_backward[:, t]) * (1 - np.tanh(c_backward[:, t]) ** 2)

            dW_o += np.dot(do_forward * self.sigmoid(h_forward[:, t]) * (1 - self.sigmoid(h_forward[:, t])), np.concatenate((h_forward[:, t-1:t], X[:, t:t+1]), axis=0).T)
            db_o += np.sum(do_forward * self.sigmoid(h_forward[:, t]) * (1 - self.sigmoid(h_forward[:, t])), axis=1, keepdims=True)

            dW_o_back += np.dot(do_backward * self.sigmoid(h_backward[:, t]) * (1 - self.sigmoid(h_backward[:, t])), np.concatenate((h_backward[:, t+1:t+2], X[:, t:t+1]), axis=0).T)
            db_o_back += np.sum(do_backward * self.sigmoid(h_backward[:, t]) * (1 - self.sigmoid(h_backward[:, t])), axis=1, keepdims=True)

            df_forward = dc_forward[:, t] * c_forward[:, t-1]
            di_forward = dc_forward[:, t] * np.tanh(c_forward[:, t])

            df_backward = dc_backward[:, t] * c_backward[:, t+1]
            di_backward = dc_backward[:, t] * np.tanh(c_backward[:, t])

            dW_f += np.dot(df_forward * self.sigmoid(h_forward[:, t]) * (1 - self.sigmoid(h_forward[:, t])), np.concatenate((h_forward[:, t-1:t], X[:, t:t+1]), axis=0).T)
            db_f += np.sum(df_forward * self.sigmoid(h_forward[:, t]) * (1 - self.sigmoid(h_forward[:, t])), axis=1, keepdims=True)

            dW_f_back += np.dot(df_backward * self.sigmoid(h_backward[:, t]) * (1 - self.sigmoid(h_backward[:, t])), np.concatenate((h_backward[:, t+1:t+2], X[:, t:t+1]), axis=0).T)
            db_f_back += np.sum(df_backward * self.sigmoid(h_backward[:, t]) * (1 - self.sigmoid(h_backward[:, t])), axis=1, keepdims=True)

            dW_i += np.dot(di_forward * self.sigmoid(h_forward[:, t]) * (1 - self.sigmoid(h_forward[:, t])), np.concatenate((h_forward[:, t-1:t], X[:, t:t+1]), axis=0).T)
            db_i += np.sum(di_forward * self.sigmoid(h_forward[:, t]) * (1 - self.sigmoid(h_forward[:, t])), axis=1, keepdims=True)

            dW_i_back += np.dot(di_backward * self.sigmoid(h_backward[:, t]) * (1 - self.sigmoid(h_backward[:, t])), np.concatenate((h_backward[:, t+1:t+2], X[:, t:t+1]), axis=0).T)
            db_i_back += np.sum(di_backward * self.sigmoid(h_backward[:, t]) * (1 - self.sigmoid(h_backward[:, t])), axis=1, keepdims=True)

            dc_forward[:, t-1] = dc_forward[:, t] * self.sigmoid(h_forward[:, t])
            dc_backward[:, t+1] = dc_backward[:, t] * self.sigmoid(h_backward[:, t])

            dh_forward[:, t-1] = np.dot(self.W_f[:, :self.hidden_size].T, df_forward) + np.dot(self.W_i[:, :self.hidden_size].T, di_forward)
            dh_backward[:, t+1] = np.dot(self.W_f_back[:, :self.hidden_size].T, df_backward) + np.dot(self.W_i_back[:, :self.hidden_size].T, di_backward)

        self.W_y -= self.learning_rate * dW_y
        self.b_y -= self.learning_rate * db_y

        self.W_o -= self.learning_rate * dW_o
        self.b_o -= self.learning_rate * db_o

        self.W_o_back -= self.learning_rate * dW_o_back
        self.b_o_back -= self.learning_rate * db_o_back

        self.W_f -= self.learning_rate * dW_f
        self.b_f -= self.learning_rate * db_f

        self.W_f_back -= self.learning_rate * dW_f_back
        self.b_f_back -= self.learning_rate * db_f_back

        self.W_i -= self.learning_rate * dW_i
        self.b_i -= self.learning_rate * db_i

        self.W_i_back -= self.learning_rate * dW_i_back
        self.b_i_back -= self.learning_rate * db_i_back

    def train(self, X_train, Y_train, epochs):
        for epoch in range(epochs):
            for X, Y_true in zip(X_train, Y_train):
                # Forward pass
                Y_pred, h_forward, c_forward, h_backward, c_backward = self.forward(X)

                # Backpropagation through time (BPTT) and gradient descent
                self.backward(X, Y_true, Y_pred, h_forward, c_forward, h_backward, c_backward)

# Example usage:
input_size = 10
hidden_size = 20
output_size = 5
seq_length = 15
learning_rate = 0.001
epochs = 10

# Generate random training data
X_train = [np.random.randn(input_size, seq_length) for _ in range(100)]
Y_train = [np.random.randn(output_size, seq_length) for _ in range(100)]

# Create and train the BiLSTM model
bilstm = SimpleBiLSTM(input_size, hidden_size, output_size, learning_rate)
bilstm.train(X_train, Y_train, epochs)


ValueError: operands could not be broadcast together with shapes (20,) (300,) (20,) 

In [7]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

def tanh_derivative(x):
    return 1 - np.tanh(x) ** 2

class SimpleBiLSTM:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.001):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        # Initialize weights and biases for forward LSTM
        self.W_f = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_f = np.zeros((hidden_size, 1))

        self.W_i = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_i = np.zeros((hidden_size, 1))

        self.W_c = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_c = np.zeros((hidden_size, 1))

        self.W_o = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_o = np.zeros((hidden_size, 1))

        # Initialize weights and biases for backward LSTM
        self.W_f_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_f_back = np.zeros((hidden_size, 1))

        self.W_i_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_i_back = np.zeros((hidden_size, 1))

        self.W_c_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_c_back = np.zeros((hidden_size, 1))

        self.W_o_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_o_back = np.zeros((hidden_size, 1))

        # Initialize output layer weights
        self.W_y = np.random.randn(output_size, 2 * hidden_size)
        self.b_y = np.zeros((output_size, 1))

    def forward_step(self, x_t, h_prev, c_prev, W_f, W_i, W_c, W_o, b_f, b_i, b_c, b_o):
        concat = np.vstack((h_prev, x_t))
        
        f_t = sigmoid(np.dot(W_f, concat) + b_f)
        i_t = sigmoid(np.dot(W_i, concat) + b_i)
        c_hat_t = np.tanh(np.dot(W_c, concat) + b_c)
        c_t = f_t * c_prev + i_t * c_hat_t
        o_t = sigmoid(np.dot(W_o, concat) + b_o)
        h_t = o_t * np.tanh(c_t)
        
        return h_t, c_t

    def forward(self, X):
        T = X.shape[1]
        
        h_forward = np.zeros((self.hidden_size, T))
        c_forward = np.zeros((self.hidden_size, T))
        
        h_backward = np.zeros((self.hidden_size, T))
        c_backward = np.zeros((self.hidden_size, T))

        h_prev_forward = np.zeros((self.hidden_size, 1))
        c_prev_forward = np.zeros((self.hidden_size, 1))
        
        h_prev_backward = np.zeros((self.hidden_size, 1))
        c_prev_backward = np.zeros((self.hidden_size, 1))

        # Forward pass
        for t in range(T):
            h_t, c_t = self.forward_step(X[:, t].reshape(-1, 1), h_prev_forward, c_prev_forward,
                                         self.W_f, self.W_i, self.W_c, self.W_o,
                                         self.b_f, self.b_i, self.b_c, self.b_o)
            h_forward[:, t] = h_t.flatten()
            c_forward[:, t] = c_t.flatten()
            h_prev_forward, c_prev_forward = h_t, c_t

        # Backward pass
        for t in reversed(range(T)):
            h_t, c_t = self.forward_step(X[:, t].reshape(-1, 1), h_prev_backward, c_prev_backward,
                                         self.W_f_back, self.W_i_back, self.W_c_back, self.W_o_back,
                                         self.b_f_back, self.b_i_back, self.b_c_back, self.b_o_back)
            h_backward[:, t] = h_t.flatten()
            c_backward[:, t] = c_t.flatten()
            h_prev_backward, c_prev_backward = h_t, c_t

        # Concatenate forward and backward hidden states
        h_concat = np.vstack((h_forward, h_backward))
        
        # Output layer
        y = sigmoid(np.dot(self.W_y, h_concat) + self.b_y)
        
        return y

    def backward(self, X, Y, y_pred):
        T = X.shape[1]
        
        dW_y = np.zeros_like(self.W_y)
        db_y = np.zeros_like(self.b_y)
        
        dW_f = np.zeros_like(self.W_f)
        dW_i = np.zeros_like(self.W_i)
        dW_c = np.zeros_like(self.W_c)
        dW_o = np.zeros_like(self.W_o)
        db_f = np.zeros_like(self.b_f)
        db_i = np.zeros_like(self.b_i)
        db_c = np.zeros_like(self.b_c)
        db_o = np.zeros_like(self.b_o)
        
        dW_f_back = np.zeros_like(self.W_f_back)
        dW_i_back = np.zeros_like(self.W_i_back)
        dW_c_back = np.zeros_like(self.W_c_back)
        dW_o_back = np.zeros_like(self.W_o_back)
        db_f_back = np.zeros_like(self.b_f_back)
        db_i_back = np.zeros_like(self.b_i_back)
        db_c_back = np.zeros_like(self.b_c_back)
        db_o_back = np.zeros_like(self.b_o_back)

        dy = y_pred - Y

        for t in range(T):
            h_concat = np.vstack((self.forward[:, t], self.h_backward[:, t])).reshape(-1, 1)
            dW_y += np.dot(dy[:, t].reshape(-1, 1), h_concat.T)
            db_y += dy[:, t].reshape(-1, 1)

        # Backpropagation through time (BPTT) for LSTM (Forward and Backward)
        # (Update weights using gradients not shown for simplicity)
        
        # Update weights (example for W_f)
        self.W_f -= self.learning_rate * dW_f
        self.b_f -= self.learning_rate * db_f

        self.W_i -= self.learning_rate * dW_i
        self.b_i -= self.learning_rate * db_i

        self.W_c -= self.learning_rate * dW_c
        self.b_c -= self.learning_rate * db_c

        self.W_o -= self.learning_rate * dW_o
        self.b_o -= self.learning_rate * db_o

        self.W_f_back -= self.learning_rate * dW_f_back
        self.b_f_back -= self.learning_rate * db_f_back

        self.W_i_back -= self.learning_rate * dW_i_back
        self.b_i_back -= self.learning_rate * db_i_back

        self.W_c_back -= self.learning_rate * dW_c_back
        self.b_c_back -= self.learning_rate * db_c_back

        self.W_o_back -= self.learning_rate * dW_o_back
        self.b_o_back -= self.learning_rate * db_o_back

        self.W_y -= self.learning_rate * dW_y
        self.b_y -= self.learning_rate * db_y

    def train(self, X, Y, epochs=100):
        for epoch in range(epochs):
            y_pred = self.forward(X)
            self.backward(X, Y, y_pred)
            if epoch % 10 == 0:
                loss = np.mean((Y - y_pred) ** 2)
                print(f'Epoch {epoch}, Loss: {loss}')

# Example usage with random input and output
input_size = 10
hidden_size = 20
output_size = 2

X = np.random.randn(input_size, 5)  # Random input with 5 time steps
Y = np.random.randint(0, 2, (output_size, 5))  # Random binary output

bilstm = SimpleBiLSTM(input_size, hidden_size, output_size)
bilstm.train(X, Y)


TypeError: 'method' object is not subscriptable

In [8]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

def tanh_derivative(x):
    return 1 - np.tanh(x) ** 2

class SimpleBiLSTM:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.001):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        # Initialize weights and biases for forward LSTM
        self.W_f = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_f = np.zeros((hidden_size, 1))

        self.W_i = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_i = np.zeros((hidden_size, 1))

        self.W_c = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_c = np.zeros((hidden_size, 1))

        self.W_o = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_o = np.zeros((hidden_size, 1))

        # Initialize weights and biases for backward LSTM
        self.W_f_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_f_back = np.zeros((hidden_size, 1))

        self.W_i_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_i_back = np.zeros((hidden_size, 1))

        self.W_c_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_c_back = np.zeros((hidden_size, 1))

        self.W_o_back = np.random.randn(hidden_size, hidden_size + input_size)
        self.b_o_back = np.zeros((hidden_size, 1))

        # Initialize output layer weights
        self.W_y = np.random.randn(output_size, 2 * hidden_size)
        self.b_y = np.zeros((output_size, 1))

    def forward_step(self, x_t, h_prev, c_prev, W_f, W_i, W_c, W_o, b_f, b_i, b_c, b_o):
        concat = np.vstack((h_prev, x_t))
        
        f_t = sigmoid(np.dot(W_f, concat) + b_f)
        i_t = sigmoid(np.dot(W_i, concat) + b_i)
        c_hat_t = np.tanh(np.dot(W_c, concat) + b_c)
        c_t = f_t * c_prev + i_t * c_hat_t
        o_t = sigmoid(np.dot(W_o, concat) + b_o)
        h_t = o_t * np.tanh(c_t)
        
        return h_t, c_t

    def forward(self, X):
        T = X.shape[1]
        
        self.h_forward = np.zeros((self.hidden_size, T))
        self.c_forward = np.zeros((self.hidden_size, T))
        
        self.h_backward = np.zeros((self.hidden_size, T))
        self.c_backward = np.zeros((self.hidden_size, T))

        h_prev_forward = np.zeros((self.hidden_size, 1))
        c_prev_forward = np.zeros((self.hidden_size, 1))
        
        h_prev_backward = np.zeros((self.hidden_size, 1))
        c_prev_backward = np.zeros((self.hidden_size, 1))

        # Forward pass
        for t in range(T):
            h_t, c_t = self.forward_step(X[:, t].reshape(-1, 1), h_prev_forward, c_prev_forward,
                                         self.W_f, self.W_i, self.W_c, self.W_o,
                                         self.b_f, self.b_i, self.b_c, self.b_o)
            self.h_forward[:, t] = h_t.flatten()
            self.c_forward[:, t] = c_t.flatten()
            h_prev_forward, c_prev_forward = h_t, c_t

        # Backward pass
        for t in reversed(range(T)):
            h_t, c_t = self.forward_step(X[:, t].reshape(-1, 1), h_prev_backward, c_prev_backward,
                                         self.W_f_back, self.W_i_back, self.W_c_back, self.W_o_back,
                                         self.b_f_back, self.b_i_back, self.b_c_back, self.b_o_back)
            self.h_backward[:, t] = h_t.flatten()
            self.c_backward[:, t] = c_t.flatten()
            h_prev_backward, c_prev_backward = h_t, c_t

        # Concatenate forward and backward hidden states
        h_concat = np.vstack((self.h_forward, self.h_backward))
        
        # Output layer
        y = sigmoid(np.dot(self.W_y, h_concat) + self.b_y)
        
        return y

    def backward(self, X, Y, y_pred):
        T = X.shape[1]
        
        dW_y = np.zeros_like(self.W_y)
        db_y = np.zeros_like(self.b_y)
        
        dW_f = np.zeros_like(self.W_f)
        dW_i = np.zeros_like(self.W_i)
        dW_c = np.zeros_like(self.W_c)
        dW_o = np.zeros_like(self.W_o)
        db_f = np.zeros_like(self.b_f)
        db_i = np.zeros_like(self.b_i)
        db_c = np.zeros_like(self.b_c)
        db_o = np.zeros_like(self.b_o)
        
        dW_f_back = np.zeros_like(self.W_f_back)
        dW_i_back = np.zeros_like(self.W_i_back)
        dW_c_back = np.zeros_like(self.W_c_back)
        dW_o_back = np.zeros_like(self.W_o_back)
        db_f_back = np.zeros_like(self.b_f_back)
        db_i_back = np.zeros_like(self.b_i_back)
        db_c_back = np.zeros_like(self.b_c_back)
        db_o_back = np.zeros_like(self.b_o_back)

        dy = y_pred - Y

        for t in range(T):
            h_concat = np.vstack((self.h_forward[:, t], self.h_backward[:, t])).reshape(-1, 1)
            dW_y += np.dot(dy[:, t].reshape(-1, 1), h_concat.T)
            db_y += dy[:, t].reshape(-1, 1)

        # Backpropagation through time (BPTT) for LSTM (Forward and Backward)
        # (Update weights using gradients not shown for simplicity)
        
        # Update weights (example for W_f)
        self.W_f -= self.learning_rate * dW_f
        self.b_f -= self.learning_rate * db_f

        self.W_i -= self.learning_rate * dW_i
        self.b_i -= self.learning_rate * db_i

        self.W_c -= self.learning_rate * dW_c
        self.b_c -= self.learning_rate * db_c

        self.W_o -= self.learning_rate * dW_o
        self.b_o -= self.learning_rate * db_o

        self.W_f_back -= self.learning_rate * dW_f_back
        self.b_f_back -= self.learning_rate * db_f_back

        self.W_i_back -= self.learning_rate * dW_i_back
        self.b_i_back -= self.learning_rate * db_i_back

        self.W_c_back -= self.learning_rate * dW_c_back
        self.b_c_back -= self.learning_rate * db_c_back

        self.W_o_back -= self.learning_rate * dW_o_back
        self.b_o_back -= self.learning_rate * db_o_back

        self.W_y -= self.learning_rate * dW_y
        self.b_y -= self.learning_rate * db_y

    def train(self, X, Y, epochs=100):
        for epoch in range(epochs):
            y_pred = self.forward(X)
            self.backward(X, Y, y_pred)
            if epoch % 10 == 0:
                loss = np.mean((Y - y_pred) ** 2)
                print(f'Epoch {epoch}, Loss: {loss}')

# Example usage with random input and output
input_size = 10
hidden_size = 20
output_size = 2

X = np.random.randn(input_size, 5)  # Random input with 5 time steps
Y = np.random.randint(0, 2, (output_size, 5))  # Random binary output

bilstm = SimpleBiLSTM(input_size, hidden_size, output_size)
bilstm.train(X, Y)


Epoch 0, Loss: 0.44897024083686593
Epoch 10, Loss: 0.4447017142233314
Epoch 20, Loss: 0.44046608249831304
Epoch 30, Loss: 0.43626574772327753
Epoch 40, Loss: 0.4321029901705102
Epoch 50, Loss: 0.42797996053771137
Epoch 60, Loss: 0.42389867320455143
Epoch 70, Loss: 0.4198610005948521
Epoch 80, Loss: 0.4158686686857023
Epoch 90, Loss: 0.41192325368289273


### Connectionist Temporal Classification (CTC)

CTC addresses the challenges of labeling unsegmented sequence data directly with RNNs. Here's a brief overview:

Given a set \( S \) of training examples where each example consists of a pair of sequences \( (x, z) \) with \( x = (x_1, \ldots, x_T) \) and \( z = (z_1, \ldots, z_U) \), where \( U \leq T \), the sequences \( x \) and \( z \) are not aligned initially due to their differing lengths.

#### CTC Method:

1. **Softmax Layer**: A softmax layer is used to define an output distribution $$ P(k|t) $$ at each time step \( t \) along the input sequence \( x \). This distribution covers \( K \) phonemes plus an extra blank symbol \( \emptyset \), making the softmax layer size \( K + 1 \).

2. **Bidirectional RNNs**: Typically, RNNs trained with CTC are bidirectional to ensure each $$ P(k|t) $$ depends on the entire input sequence \( x \).

3. **Output Vector**: The unnormalized output vector $$ y_t $$ at time step \( t \) is defined as:
   $$ y_t = W_{\rightarrow N} h_{\rightarrow N}^t + W_{\leftarrow N} h_{\leftarrow N}^t + b_y $$
   where:
   - \( W_{\rightarrow N} \) and \( W_{\leftarrow N} \) are weight matrices from forward and backward hidden states to the output gate.
   - \( h_{\rightarrow N}^t \) and \( h_{\leftarrow N}^t \) are forward and backward hidden states at time step \( t \).
   - \( b_y \) is the bias.

4. **Output Probability $$ P(k|t) $$**:
   $$ P(k|t) = \frac{\exp(y_t[k])}{\sum_{k'=1}^{K+1} \exp(y_t[k'])} $$
   where \( y_t[k] \) is the \( k \)-th element of \( y_t \).

### S-LSTM Network

The S-LSTM network extends LSTM to handle longer-term dependencies and more complex input structures:

#### S-LSTM Memory Block:

Each S-LSTM memory block contains:
- One input gate $$ i_t $$
- One output gate $$ o_t $$
- Multiple forget gates depending on the number of children of a node

#### Forward Computation:

1. **Input Gate $$ i_t $$**:
   $$ i_t = \sigma(W_{Lh}^i h_{Lt-1} + W_{Rh}^i h_{Rt-1} + W_{Lc}^i c_{Lt-1} + W_{Rc}^i c_{Rt-1} + b_i) $$
   where:
   - \( \sigma \) is the logistic sigmoid function.
   - \( h_{Lt-1}, h_{Rt-1} \) are hidden vectors of the left and right children.
   - \( c_{Lt-1}, c_{Rt-1} \) are cell vectors of the left and right children.
   - \( W \) and \( b \) are weight matrices and biases.

These networks enhance the capabilities of traditional RNNs and LSTMs by addressing alignment issues and handling more complex input structures efficiently.



In [9]:
import numpy as np

class CTC:
    def __init__(self, input_size, num_classes):
        self.input_size = input_size
        self.num_classes = num_classes
        self.softmax = np.zeros((num_classes + 1, input_size))  # Including blank symbol

    def forward(self, x):
        # Assuming x is the input sequence tensor of shape (input_size, sequence_length)
        sequence_length = x.shape[1]
        outputs = np.zeros((self.num_classes + 1, sequence_length))

        for t in range(sequence_length):
            # Example softmax computation (replace with actual logits)
            logits_t = np.random.rand(self.num_classes + 1)
            outputs[:, t] = np.exp(logits_t) / np.sum(np.exp(logits_t))

        return outputs

# Example usage:
input_size = 10
num_classes = 5
ctc = CTC(input_size, num_classes)

# Example input sequence (10-dimensional input, length 20)
input_sequence = np.random.rand(input_size, 20)

# Forward pass
output_sequence = ctc.forward(input_sequence)
print(output_sequence.shape)  # Example output shape


(6, 20)


In [13]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class S_LSTM_Cell:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.weights = {
            'W_lh': np.random.randn(hidden_size, input_size),
            'W_rh': np.random.randn(hidden_size, input_size),
            'W_lc': np.random.randn(hidden_size, input_size),
            'W_rc': np.random.randn(hidden_size, input_size)
        }
        self.biases = {
            'b_i': np.zeros((hidden_size, 1))
        }

    def forward(self, h_l_prev, h_r_prev, c_l_prev, c_r_prev):
        i_t = sigmoid(
            np.dot(self.weights['W_lh'], h_l_prev) +
            np.dot(self.weights['W_rh'], h_r_prev) +
            np.dot(self.weights['W_lc'], c_l_prev) +
            np.dot(self.weights['W_rc'], c_r_prev) +
            self.biases['b_i']
        )
        return i_t

# Example usage:
input_size = 5  # Example input size
hidden_size = 10  # Example hidden size
s_lstm_cell = S_LSTM_Cell(input_size, hidden_size)

# Example inputs (previous hidden and cell states)
h_l_prev = np.random.randn(hidden_size, 1)
h_r_prev = np.random.randn(hidden_size, 1)
c_l_prev = np.random.randn(hidden_size, 1)
c_r_prev = np.random.randn(hidden_size, 1)

# Forward pass through S-LSTM cell
i_t = s_lstm_cell.forward(h_l_prev, h_r_prev, c_l_prev, c_r_prev)
print(i_t.shape)  # Example output shape


ValueError: shapes (10,5) and (10,1) not aligned: 5 (dim 1) != 10 (dim 0)