In [1]:
import numpy as np

LSTM Layer

In [2]:
# LSTM Layer
class Layer_LSTM:
    def __init__(self, n_inputs, n_neurons, n_outputs):
        # Initialize weights for input, forget, cell, and output gates
        self.weights_input = 0.01 * np.random.randn(n_inputs, n_neurons * 4)
        self.weights_hidden = 0.01 * np.random.randn(n_neurons, n_neurons * 4)
        self.biases = np.zeros((1, n_neurons * 4))

        # Output layer weights
        self.weights_output = 0.01 * np.random.randn(n_neurons, n_outputs)
        self.bias_output = np.zeros((1, n_outputs))

        # Store hidden and cell states
        self.hidden_state = np.zeros((1, n_neurons))
        self.cell_state = np.zeros((1, n_neurons))

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    # Forward pass
    def forward(self, inputs):
        self.inputs = inputs
        
        # Compute gate activations
        gates = (np.dot(inputs, self.weights_input) +
                 np.dot(self.hidden_state, self.weights_hidden) + 
                 self.biases)

        # Split gate outputs
        i_gate, f_gate, c_gate, o_gate = np.split(gates, 4, axis=1)

        # Apply activations
        i_gate = self.sigmoid(i_gate)  # Input gate
        f_gate = self.sigmoid(f_gate)  # Forget gate
        c_gate = np.tanh(c_gate)       # Candidate cell state
        o_gate = self.sigmoid(o_gate)  # Output gate

        # Update cell state
        self.cell_state = f_gate * self.cell_state + i_gate * c_gate

        # Compute hidden state
        self.hidden_state = o_gate * np.tanh(self.cell_state)

        # Output layer computation
        self.output = np.dot(self.hidden_state, self.weights_output) + self.bias_output

    # Backward pass
    def backward(self, dvalues):
        # Gradients for output layer
        self.dweights_output = np.dot(self.hidden_state.T, dvalues)
        self.dbias_output = np.sum(dvalues, axis=0, keepdims=True)

        # Gradient for hidden state
        d_hidden = np.dot(dvalues, self.weights_output.T)

        # Compute gate gradients
        d_o_gate = d_hidden * np.tanh(self.cell_state)
        d_cell_state = d_hidden * self.sigmoid(d_o_gate) * (1 - np.tanh(self.cell_state) ** 2)

        # Compute gradients for input, forget, and candidate gates
        d_i_gate = d_cell_state * np.tanh(self.cell_state)
        d_f_gate = d_cell_state * self.cell_state
        d_c_gate = d_cell_state * self.sigmoid(d_i_gate)

        # Apply activations' derivative
        d_i_gate *= self.sigmoid(d_i_gate) * (1 - self.sigmoid(d_i_gate))
        d_f_gate *= self.sigmoid(d_f_gate) * (1 - self.sigmoid(d_f_gate))
        d_o_gate *= self.sigmoid(d_o_gate) * (1 - self.sigmoid(d_o_gate))
        d_c_gate *= 1 - np.tanh(d_c_gate) ** 2

        # Combine gate gradients
        d_gates = np.hstack((d_i_gate, d_f_gate, d_c_gate, d_o_gate))

        # Compute input and hidden weight gradients
        self.dweights_input = np.dot(self.inputs.T, d_gates)
        self.dweights_hidden = np.dot(self.hidden_state.T, d_gates)
        self.dbiases = np.sum(d_gates, axis=0, keepdims=True)

        # Compute input gradient for previous time step
        self.dinputs = np.dot(d_gates, self.weights_input.T)

Softmax

In [3]:
# Softmax Activation for Output Layer
class Activation_Softmax:
    def forward(self, inputs):
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)


In [4]:
# Loss Function (Cross Entropy for Classification)
class Loss_CategoricalCrossentropy:
    def forward(self, y_pred, y_true):
        samples = len(y_pred)
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        correct_confidences = y_pred_clipped[range(samples), y_true]
        return -np.log(correct_confidences)



In [6]:
import numpy as np

class Optimizer_Adam:
    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, lstm_layer):
        if not hasattr(lstm_layer, 'weight_momentums_input'):
            # Initialize momentums and caches for input weights
            lstm_layer.weight_momentums_input = np.zeros_like(lstm_layer.weights_input)
            lstm_layer.weight_cache_input = np.zeros_like(lstm_layer.weights_input)

            # Initialize momentums and caches for hidden weights
            lstm_layer.weight_momentums_hidden = np.zeros_like(lstm_layer.weights_hidden)
            lstm_layer.weight_cache_hidden = np.zeros_like(lstm_layer.weights_hidden)

            # Initialize momentums and caches for biases
            lstm_layer.bias_momentums = np.zeros_like(lstm_layer.biases)
            lstm_layer.bias_cache = np.zeros_like(lstm_layer.biases)

        # Compute momentums for input weights
        lstm_layer.weight_momentums_input = self.beta_1 * lstm_layer.weight_momentums_input + \
                                            (1 - self.beta_1) * lstm_layer.dweights_input
        # Compute momentums for hidden weights
        lstm_layer.weight_momentums_hidden = self.beta_1 * lstm_layer.weight_momentums_hidden + \
                                             (1 - self.beta_1) * lstm_layer.dweights_hidden
        # Compute momentums for biases
        lstm_layer.bias_momentums = self.beta_1 * lstm_layer.bias_momentums + \
                                    (1 - self.beta_1) * lstm_layer.dbiases

        # Bias correction for momentums
        weight_momentums_input_corrected = lstm_layer.weight_momentums_input / (1 - self.beta_1 ** (self.iterations + 1))
        weight_momentums_hidden_corrected = lstm_layer.weight_momentums_hidden / (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = lstm_layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))

        # Compute cache updates for input weights
        lstm_layer.weight_cache_input = self.beta_2 * lstm_layer.weight_cache_input + \
                                        (1 - self.beta_2) * lstm_layer.dweights_input ** 2
        # Compute cache updates for hidden weights
        lstm_layer.weight_cache_hidden = self.beta_2 * lstm_layer.weight_cache_hidden + \
                                         (1 - self.beta_2) * lstm_layer.dweights_hidden ** 2
        # Compute cache updates for biases
        lstm_layer.bias_cache = self.beta_2 * lstm_layer.bias_cache + \
                                (1 - self.beta_2) * lstm_layer.dbiases ** 2

        # Bias correction for cache values
        weight_cache_input_corrected = lstm_layer.weight_cache_input / (1 - self.beta_2 ** (self.iterations + 1))
        weight_cache_hidden_corrected = lstm_layer.weight_cache_hidden / (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = lstm_layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))

        # Update input weights
        lstm_layer.weights_input -= self.current_learning_rate * weight_momentums_input_corrected / \
                                    (np.sqrt(weight_cache_input_corrected) + self.epsilon)
        # Update hidden weights
        lstm_layer.weights_hidden -= self.current_learning_rate * weight_momentums_hidden_corrected / \
                                     (np.sqrt(weight_cache_hidden_corrected) + self.epsilon)
        # Update biases
        lstm_layer.biases -= self.current_learning_rate * bias_momentums_corrected / \
                             (np.sqrt(bias_cache_corrected) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1


In [7]:
# Create LSTM Layer
lstm_layer = Layer_LSTM(n_inputs=3, n_neurons=5, n_outputs=2)

# Activation & Loss
activation_softmax = Activation_Softmax()
loss_function = Loss_CategoricalCrossentropy()

# Optimizer
optimizer = Optimizer_Adam(learning_rate=0.01)

# Dummy data
X = np.random.randn(10, 3)  # 10 samples, 3 features
y = np.random.randint(0, 2, size=(10,))  # 10 target labels (0 or 1)

# Training loop
for epoch in range(1000):
    lstm_layer.forward(X)
    activation_softmax.forward(lstm_layer.output)
    loss = loss_function.forward(activation_softmax.output, y)

    # Get predictions (class with highest probability)
    predictions = np.argmax(activation_softmax.output, axis=1)
    accuracy = np.mean(predictions == y)

    # Backward pass
    lstm_layer.backward(activation_softmax.output - np.eye(2)[y])

    # Update weights
    optimizer.update_params(lstm_layer)

    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Accuracy: {accuracy:.4f}, Loss: {loss.mean()}")

# Test on a new sample
X_test = np.array([[0.7, 0.7, -0.2]])
lstm_layer.forward(X_test)
activation_softmax.forward(lstm_layer.output)
predictions = np.argmax(activation_softmax.output, axis=1)

print("Predicted Class:", predictions)


Epoch 0, Accuracy: 0.6000, Loss: 0.6931481759806717
Epoch 100, Accuracy: 0.9000, Loss: 0.6734842418364076
Epoch 200, Accuracy: 0.9000, Loss: 0.673607035232079
Epoch 300, Accuracy: 0.9000, Loss: 0.6736678073772002
Epoch 400, Accuracy: 0.9000, Loss: 0.673704627841007
Epoch 500, Accuracy: 0.9000, Loss: 0.6737290448039464
Epoch 600, Accuracy: 0.9000, Loss: 0.6737459934245937
Epoch 700, Accuracy: 0.9000, Loss: 0.67375804262496
Epoch 800, Accuracy: 0.9000, Loss: 0.673766711951876
Epoch 900, Accuracy: 0.9000, Loss: 0.6737729823732204
Predicted Class: [0 1 0 0 0 0 0 0 0 0]
