In [None]:
######################################################################################
'''Copyright (c) 2004- 2024 , Prof. Radhamadhab Dalai Odisha, India
Author's email address :  rmdi115@gmail.com'''
###################################################################################

###Softmax Regression and Softmax Function
### Softmax Function
The softmax function is used to convert a vector of raw scores (logits) into probabilities. For a vector \( z \) of length \( K \), the softmax function is defined as:

$
\sigma(z)_j = \frac{e^{z_j}}{\sum_{k=1}^{K} e^{z_k}}
$

where:
- $ \sigma(z)_j $ is the probability of the  j-th class.
- $ z_j $ is the j-th element of the input vector z.
- K is the number of classes.

### Cross-Entropy Loss for Multi-Class Classification
The cross-entropy loss for multi-class classification is defined as:

$
H(y, \hat{y}) = - \frac{1}{T} \sum_{t=1}^{T} \sum_{k=1}^{K} y_{t,k} \log(\hat{y}_{t,k})
$

where:
-  T is the total number of samples.
-  K is the number of classes.
-  $ y_{t,k} $ is a binary indicator (0 or 1) if class label k is the correct classification for sample t.
- $ \hat{y}_{t,k} $ is the predicted probability of class k for sample t.

### Gradient of the Cross-Entropy Loss Function
The gradient of the cross-entropy loss function with respect to the predicted probabilities $ \hat{y}_t $ is:

$
\frac{\partial H(y, \hat{y})}{\partial \hat{y}_t} = \frac{\hat{y}_t - y_t}{T}
$

The gradient with respect to the weights w is:

$
\nabla_w H(y, \hat{y}) = \frac{1}{T} \sum_{t=1}^{T} ( \hat{y}_t - y_t ) x_t
$

### Weight Update Rule
The weight update rule using gradient descent for softmax regression is:

$
w^{k+1} = w^k - \eta \nabla_w H(y, \hat{y})
$

where:
- $ \eta $ is the learning rate.
- $ w^k $ is the weight vector at iteration k.
- $ \nabla_w H(y, \hat{y}) $ is the gradient of the KL divergence (cross-entropy loss) with respect to the weights.




In [2]:
import numpy as np

# Softmax function
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  # stability improvement
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

# Cross-entropy loss function for softmax regression
def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Avoid log(0)
    return -np.mean(np.sum(y_true * np.log(y_pred), axis=1))

# Gradient of the cross-entropy loss function
def gradient_cross_entropy_loss(X, y_true, y_pred):
    return np.dot(X.T, (y_pred - y_true)) / X.shape[0]

# Softmax regression training function
def softmax_regression_train(X, y, learning_rate, epochs):
    n_samples, n_features = X.shape
    n_classes = y.shape[1]
    weights = np.zeros((n_features, n_classes))
    bias = np.zeros(n_classes)

    for _ in range(epochs):
        # Linear combination of features and weights
        linear_model = np.dot(X, weights) + bias

        # Predictions using softmax function
        y_pred = softmax(linear_model)

        # Compute the gradient of the loss function
        gradient = gradient_cross_entropy_loss(X, y, y_pred)
        bias_gradient = np.mean(y_pred - y, axis=0)

        # Update weights and bias using gradient descent
        weights -= learning_rate * gradient
        bias -= learning_rate * bias_gradient

    return weights, bias

# Softmax regression prediction function
def softmax_regression_predict(X, weights, bias):
    linear_model = np.dot(X, weights) + bias
    return softmax(linear_model)

# Example data (one-hot encoded labels)
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
y = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])

# Training
learning_rate = 0.01
epochs = 1000
weights, bias = softmax_regression_train(X, y, learning_rate, epochs)

# Prediction
y_pred = softmax_regression_predict(X, weights, bias)
print("Predictions:", y_pred)

# Loss
loss = cross_entropy_loss(y, y_pred)
print("Cross-entropy loss:", loss)


Predictions: [[0.68065166 0.18928148 0.13006686]
 [0.49600597 0.26347298 0.24052105]
 [0.30814999 0.31266358 0.37918643]
 [0.1649971  0.3197845  0.5152184 ]]
Cross-entropy loss: 0.727915949036652


### Softmax Regression

Given a set of training samples {($x_1$, $y_1$), $\ldots$, ($x_T$, $y_T$)}, where $x_t \in \mathbb{R}^n$ and $y_t \in \{1, \ldots, k\} $, we design a weight matrix W:

$
W = [w_1, w_2, \ldots, w_k]
$

### Softmax Function

The softmax function converts a vector of raw scores (logits) into probabilities. For a vector z of length k, the softmax function is defined as:

$
\sigma(z)_j = \frac{e^{z_j}}{\sum_{k=1}^{K} e^{z_k}}
$

where:
- $ \sigma(z)_j $ is the probability of the j-th class.
- $ z_j $ is the j-th element of the input vector z.
- k is the number of classes.

### Cross-Entropy Loss for Multi-Class Classification

The cross-entropy loss for multi-class classification is defined as:

$
H(y, \hat{y}) = - \frac{1}{T} \sum_{t=1}^{T} \sum_{i=1}^{k} y_{t,i} \log(\hat{y}_{t,i})
$

where:
- T  is the total number of samples.
- k  is the number of classes.
- y_{t,i}  is a binary indicator (0 or 1) if class label i is the correct classification for sample t .
- $\hat{y}_{t,i} $ is the predicted probability of class i for sample t.

### Gradient of the Cross-Entropy Loss Function

The gradient of the cross-entropy loss function with respect to the predicted probabilities $\hat{y}_t$ is:

$
\frac{\partial H(y, \hat{y})}{\partial \hat{y}_t} = \frac{\hat{y}_t - y_t}{T}
$

The gradient with respect to the weights W is:

$
\nabla_W H(y, \hat{y}) = \frac{1}{T} \sum_{t=1}^{T} X_t^T (\hat{y}_t - y_t)
$

### Weight Update Rule

The weight update rule using gradient descent for softmax regression is:

$
W^{k+1} = W^k - \eta \nabla_W H(y, \hat{y})
$

where:
- $ \eta $ is the learning rate.
- $W^k $ is the weight vector at iteration k.
- $\nabla_W H(y, \hat{y}) $ is the gradient of the KL divergence (cross-entropy loss) with respect to the weights.

### RNN Output

For an RNN, its output vector $g(Wx_t)$ is directly used as the hypothetical function. Letting $ z = Wx_t $, the hypothetical function is:

$
h_W(x_t) = g(Wx_t)
$

### Softmax Function for RNN Output

The softmax function for the vector z is:

$
\sigma(z)_j = \frac{e^{z_j}}{\sum_{i=1}^{k} e^{z_i}}
$

### Cross-Entropy Loss for RNN

The cross-entropy loss for a single training example in the context of RNNs is:

$
H(y_t, \hat{y}_t) = - \sum_{i=1}^{k} y_{t,i} \log(\hat{y}_{t,i})
$

### Weight Update Rule for RNN

The weight update rule using gradient descent for updating the weights W in an RNN is:

$
W^{k+1} = W^k - \eta \nabla_W H(y, \hat{y})
$

where:
- $\eta$  is the learning rate.
- $W^k $ is the weight matrix at iteration $ k $.
- $\nabla_W H(y, \hat{y}) $ is the gradient of the cross-entropy loss with respect to the weights.


In [3]:
import numpy as np

# Define the softmax function
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # stability improvement
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

# Define the cross-entropy loss function
def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Avoid log(0)
    return -np.mean(np.sum(y_true * np.log(y_pred), axis=1))

# Define the RNN cell
class SimpleRNN:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        
        # Initialize weights
        self.Wx = np.random.randn(input_size, hidden_size) * 0.01
        self.Wh = np.random.randn(hidden_size, hidden_size) * 0.01
        self.Wo = np.random.randn(hidden_size, output_size) * 0.01
        self.bh = np.zeros((1, hidden_size))
        self.bo = np.zeros((1, output_size))
        
        # Initialize hidden state
        self.h = np.zeros((1, hidden_size))
    
    def forward(self, x):
        self.h = np.tanh(np.dot(x, self.Wx) + np.dot(self.h, self.Wh) + self.bh)
        o = np.dot(self.h, self.Wo) + self.bo
        return softmax(o)
    
    def backward(self, x, y_true, y_pred):
        # Compute the gradient of the loss with respect to the output
        dL_dO = y_pred - y_true
        
        # Compute the gradients of Wo and bo
        dL_dWo = np.dot(self.h.T, dL_dO)
        dL_dbo = dL_dO
        
        # Backpropagation through time (for simplicity, one time step)
        dL_dh = np.dot(dL_dO, self.Wo.T)
        dL_dWh = np.dot(self.h.T, dL_dh * (1 - self.h ** 2))
        dL_dWx = np.dot(x.T, dL_dh * (1 - self.h ** 2))
        dL_dbh = dL_dh * (1 - self.h ** 2)
        
        # Update the weights and biases
        self.Wo -= self.learning_rate * dL_dWo
        self.bo -= self.learning_rate * np.sum(dL_dbo, axis=0, keepdims=True)
        self.Wh -= self.learning_rate * dL_dWh
        self.Wx -= self.learning_rate * dL_dWx
        self.bh -= self.learning_rate * np.sum(dL_dbh, axis=0, keepdims=True)

# Training the RNN
def train_rnn(rnn, X_train, y_train, epochs=1000):
    for epoch in range(epochs):
        loss = 0
        for x, y in zip(X_train, y_train):
            x = x.reshape(1, -1)
            y = y.reshape(1, -1)
            y_pred = rnn.forward(x)
            loss += cross_entropy_loss(y, y_pred)
            rnn.backward(x, y, y_pred)
        if (epoch + 1) % 100 == 0:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss/len(X_train)}')

# Example data
X_train = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
y_train = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])

# Initialize RNN
input_size = X_train.shape[1]
hidden_size = 5
output_size = y_train.shape[1]
learning_rate = 0.01

rnn = SimpleRNN(input_size, hidden_size, output_size, learning_rate)

# Train RNN
train_rnn(rnn, X_train, y_train, epochs=1000)

# Test the RNN
def predict_rnn(rnn, X):
    y_pred = []
    for x in X:
        x = x.reshape(1, -1)
        y_pred.append(rnn.forward(x))
    return np.array(y_pred)

y_pred = predict_rnn(rnn, X_train)
print("Predictions:", y_pred)


Epoch 100/1000, Loss: 1.0435256411205036
Epoch 200/1000, Loss: 0.9840876611465466
Epoch 300/1000, Loss: 0.869546581208506
Epoch 400/1000, Loss: 0.7182150784529611
Epoch 500/1000, Loss: 0.6425357646396047
Epoch 600/1000, Loss: 0.5854458052724199
Epoch 700/1000, Loss: 0.5113694386531344
Epoch 800/1000, Loss: 0.4230903768958066
Epoch 900/1000, Loss: 0.3280583317952764
Epoch 1000/1000, Loss: 0.20328721850364254
Predictions: [[[0.7384881  0.15151101 0.11000089]]

 [[0.925965   0.07198433 0.00205067]]

 [[0.19359036 0.75806083 0.04834881]]

 [[0.00288275 0.05147516 0.94564209]]]


In [4]:
import math

# Define the softmax function
def softmax(x):
    exp_x = [math.exp(i) for i in x]
    sum_exp_x = sum(exp_x)
    return [i / sum_exp_x for i in exp_x]

# Define the cross-entropy loss function
def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-15
    y_pred = [max(min(i, 1 - epsilon), epsilon) for i in y_pred]  # Avoid log(0)
    return -sum([y_true[i] * math.log(y_pred[i]) for i in range(len(y_true))])

# Matrix-vector multiplication
def mat_vec_mul(mat, vec):
    return [sum(mat[i][j] * vec[j] for j in range(len(vec))) for i in range(len(mat))]

# Vector addition
def vec_add(vec1, vec2):
    return [vec1[i] + vec2[i] for i in range(len(vec1))]

# Element-wise vector multiplication
def vec_mul(vec1, vec2):
    return [vec1[i] * vec2[i] for i in range(len(vec1))]

# Tanh activation function
def tanh(x):
    return [(math.exp(2 * i) - 1) / (math.exp(2 * i) + 1) for i in x]

# Derivative of tanh activation function
def tanh_derivative(x):
    return [1 - i**2 for i in x]

# Simple RNN cell
class SimpleRNN:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        
        # Initialize weights
        self.Wx = [[0.01 for _ in range(hidden_size)] for _ in range(input_size)]
        self.Wh = [[0.01 for _ in range(hidden_size)] for _ in range(hidden_size)]
        self.Wo = [[0.01 for _ in range(output_size)] for _ in range(hidden_size)]
        self.bh = [0.0 for _ in range(hidden_size)]
        self.bo = [0.0 for _ in range(output_size)]
        
        # Initialize hidden state
        self.h = [0.0 for _ in range(hidden_size)]
    
    def forward(self, x):
        self.h = tanh(vec_add(mat_vec_mul(self.Wx, x), vec_add(mat_vec_mul(self.Wh, self.h), self.bh)))
        o = vec_add(mat_vec_mul(self.Wo, self.h), self.bo)
        return softmax(o)
    
    def backward(self, x, y_true, y_pred):
        # Compute the gradient of the loss with respect to the output
        dL_dO = [y_pred[i] - y_true[i] for i in range(len(y_true))]
        
        # Compute the gradients of Wo and bo
        dL_dWo = [[self.h[i] * dL_dO[j] for j in range(len(dL_dO))] for i in range(len(self.h))]
        dL_dbo = dL_dO
        
        # Backpropagation through time (for simplicity, one time step)
        dL_dh = mat_vec_mul(self.Wo, dL_dO)
        dL_dh = vec_mul(dL_dh, tanh_derivative(self.h))
        
        dL_dWh = [[self.h[i] * dL_dh[j] for j in range(len(dL_dh))] for i in range(len(self.h))]
        dL_dWx = [[x[i] * dL_dh[j] for j in range(len(dL_dh))] for i in range(len(x))]
        dL_dbh = dL_dh
        
        # Update the weights and biases
        self.Wo = [[self.Wo[i][j] - self.learning_rate * dL_dWo[i][j] for j in range(len(dL_dWo[0]))] for i in range(len(self.Wo))]
        self.bo = [self.bo[i] - self.learning_rate * dL_dbo[i] for i in range(len(self.bo))]
        self.Wh = [[self.Wh[i][j] - self.learning_rate * dL_dWh[i][j] for j in range(len(dL_dWh[0]))] for i in range(len(self.Wh))]
        self.Wx = [[self.Wx[i][j] - self.learning_rate * dL_dWx[i][j] for j in range(len(dL_dWx[0]))] for i in range(len(self.Wx))]
        self.bh = [self.bh[i] - self.learning_rate * dL_dbh[i] for i in range(len(self.bh))]

# Training the RNN
def train_rnn(rnn, X_train, y_train, epochs=1000):
    for epoch in range(epochs):
        loss = 0
        for x, y in zip(X_train, y_train):
            y_pred = rnn.forward(x)
            loss += cross_entropy_loss(y, y_pred)
            rnn.backward(x, y, y_pred)
        if (epoch + 1) % 100 == 0:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss / len(X_train)}')

# Example data
X_train = [[1, 2], [2, 3], [3, 4], [4, 5]]
y_train = [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]]

# Initialize RNN
input_size = len(X_train[0])
hidden_size = 5
output_size = len(y_train[0])
learning_rate = 0.01

rnn = SimpleRNN(input_size, hidden_size, output_size, learning_rate)

# Train RNN
train_rnn(rnn, X_train, y_train, epochs=1000)

# Test the RNN
def predict_rnn(rnn, X):
    y_pred = []
    for x in X:
        y_pred.append(rnn.forward(x))
    return y_pred

y_pred = predict_rnn(rnn, X_train)
print("Predictions:", y_pred)


IndexError: list index out of range

In [5]:
import math

# Define the softmax function
def softmax(x):
    exp_x = [math.exp(i) for i in x]
    sum_exp_x = sum(exp_x)
    return [i / sum_exp_x for i in exp_x]

# Define the cross-entropy loss function
def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-15
    y_pred = [max(min(i, 1 - epsilon), epsilon) for i in y_pred]  # Avoid log(0)
    return -sum([y_true[i] * math.log(y_pred[i]) for i in range(len(y_true))])

# Matrix-vector multiplication
def mat_vec_mul(mat, vec):
    return [sum(mat[i][j] * vec[j] for j in range(len(vec))) for i in range(len(mat))]

# Vector addition
def vec_add(vec1, vec2):
    return [vec1[i] + vec2[i] for i in range(len(vec1))]

# Element-wise vector multiplication
def vec_mul(vec1, vec2):
    return [vec1[i] * vec2[i] for i in range(len(vec1))]

# Tanh activation function
def tanh(x):
    return [(math.exp(2 * i) - 1) / (math.exp(2 * i) + 1) for i in x]

# Derivative of tanh activation function
def tanh_derivative(x):
    return [1 - i**2 for i in x]

# Simple RNN cell
class SimpleRNN:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        
        # Initialize weights
        self.Wx = [[0.01 for _ in range(hidden_size)] for _ in range(input_size)]
        self.Wh = [[0.01 for _ in range(hidden_size)] for _ in range(hidden_size)]
        self.Wo = [[0.01 for _ in range(output_size)] for _ in range(hidden_size)]
        self.bh = [0.0 for _ in range(hidden_size)]
        self.bo = [0.0 for _ in range(output_size)]
        
        # Initialize hidden state
        self.h = [0.0 for _ in range(hidden_size)]
    
    def forward(self, x):
        self.h = tanh(vec_add(mat_vec_mul(self.Wx, x), vec_add(mat_vec_mul(self.Wh, self.h), self.bh)))
        o = vec_add(mat_vec_mul(self.Wo, self.h), self.bo)
        return softmax(o)
    
    def backward(self, x, y_true, y_pred):
        # Compute the gradient of the loss with respect to the output
        dL_dO = [y_pred[i] - y_true[i] for i in range(len(y_true))]
        
        # Compute the gradients of Wo and bo
        dL_dWo = [[self.h[i] * dL_dO[j] for j in range(len(dL_dO))] for i in range(len(self.h))]
        dL_dbo = dL_dO
        
        # Backpropagation through time (for simplicity, one time step)
        dL_dh = mat_vec_mul(self.Wo, dL_dO)
        dL_dh = vec_mul(dL_dh, tanh_derivative(self.h))
        
        dL_dWh = [[self.h[i] * dL_dh[j] for j in range(len(dL_dh))] for i in range(len(self.h))]
        dL_dWx = [[x[i] * dL_dh[j] for j in range(len(dL_dh))] for i in range(len(x))]
        dL_dbh = dL_dh
        
        # Update the weights and biases
        self.Wo = [[self.Wo[i][j] - self.learning_rate * dL_dWo[i][j] for j in range(len(dL_dWo[0]))] for i in range(len(self.Wo))]
        self.bo = [self.bo[i] - self.learning_rate * dL_dbo[i] for i in range(len(self.bo))]
        self.Wh = [[self.Wh[i][j] - self.learning_rate * dL_dWh[i][j] for j in range(len(dL_dWh[0]))] for i in range(len(self.Wh))]
        self.Wx = [[self.Wx[i][j] - self.learning_rate * dL_dWx[i][j] for j in range(len(dL_dWx[0]))] for i in range(len(self.Wx))]
        self.bh = [self.bh[i] - self.learning_rate * dL_dbh[i] for i in range(len(self.bh))]

# Training the RNN
def train_rnn(rnn, X_train, y_train, epochs=1000):
    for epoch in range(epochs):
        loss = 0
        for x, y in zip(X_train, y_train):
            y_pred = rnn.forward(x)
            loss += cross_entropy_loss(y, y_pred)
            rnn.backward(x, y, y_pred)
        if (epoch + 1) % 100 == 0:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss / len(X_train)}')

# Example data
X_train = [[1, 2], [2, 3], [3, 4], [4, 5]]
y_train = [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]]

# Initialize RNN
input_size = len(X_train[0])
hidden_size = 5
output_size = len(y_train[0])
learning_rate = 0.01

rnn = SimpleRNN(input_size, hidden_size, output_size, learning_rate)

# Train RNN
train_rnn(rnn, X_train, y_train, epochs=1000)

# Test the RNN
def predict_rnn(rnn, X):
    y_pred = []
    for x in X:
        y_pred.append(rnn.forward(x))
    return y_pred

y_pred = predict_rnn(rnn, X_train)
print("Predictions:", y_pred)
# give a try

IndexError: list index out of range

# Recurrent Neural Network Equations

## Definitions and Variables
- $ \mathbf{x}_t \in \mathbb{R}^n $: Input vector to the hidden layer at time t.
- $ \mathbf{h}_t \in \mathbb{R}^l $: Output of hidden layer (hidden state) at time t.
- $ \hat{\mathbf{y}}_t \in \mathbb{R}^m $: Output produced by RNN at time t.
- $ \mathbf{y} \in \mathbb{R}^m $: Desired output.
- $\mathbf{U} \in \mathbb{R}^{l \times n} $: Input-hidden weight matrix.
- $ \mathbf{W} \in \mathbb{R}^{l \times l} $: Hidden-hidden weight matrix.
- $ \mathbf{V} \in \mathbb{R}^{m \times l} $: Hidden-output weight matrix.
- $\mathbf{b}_t \in \mathbb{R}^l $: Bias vectors (intercept terms) of hidden layers.
- $\mathbf{c}_t \in \mathbb{R}^m $: Bias vectors (intercept terms) of output layers.
- $f(\mathbf{h}) \in \mathbb{R} $: Elementwise activation function of hidden layer.
- $ g(\mathbf{y}) \in \mathbb{R} $: Elementwise activation function of output layer.

## Feedforward Network (FFN) Dynamics

### Hidden States for FFN:
$$
h_j(t) = f(\text{net}_j), \quad \text{for } j = 1, \ldots, l,
$$

$$
\text{net}_j = \sum_{i=1}^n U_{ji} x_i(t) + b_j(t), \quad \text{for } j = 1, \ldots, l.
$$

### Output Nodes for FFN:
$$
y_k(t) = g(\text{net}_k), \quad \text{for } k = 1, \ldots, m,
$$

$$
\text{net}_k = \sum_{j=1}^l V_{kj} h_j(t) + c_j(t), \quad \text{for } k = 1, \ldots, m.
$$

### FFN in Matrix-Vector Form:
$$
\mathbf{h}_t = f(\mathbf{U} \mathbf{x}_t + \mathbf{b}_t),
$$

$$
\mathbf{y}_t = g(\mathbf{V} \mathbf{h}_t + \mathbf{c}_t).
$$

## Recurrent Neural Network (RNN) Dynamics

### Hidden State Recurrence:
$$
h_j(t) = f(\text{net}_j), \quad \text{for } j = 1, \ldots, l,
$$

$$
\text{net}_j = \sum_{i=1}^n U_{ji} x_i(t) + \sum_{\tilde{j}=1}^l W_{j\tilde{j}} h_{\tilde{j}}(t-1) + b_j, \quad \text{for } j = 1, \ldots, l.
$$


# Recurrent Neural Network (RNN) Equations

## Feedforward Network (FFN) Dynamics

The dynamical system for FFNs can be written in matrix-vector form as:

$$
\mathbf{h}_t = f(\mathbf{U}_{hx} \mathbf{x}_t + \mathbf{b}_t),
$$

$$
\mathbf{y}_t = g(\mathbf{V}_{yh} \mathbf{h}_t + \mathbf{c}_t).
$$

## Recurrent Neural Network (RNN) Dynamics

An RNN is a neural network that simulates a discrete-time dynamical system with:
- An input vector $ \mathbf{x}_t = [x_1(t), \ldots, x_n(t)]^T $,
- An output vector $ \mathbf{y}_t = [y_1(t), \ldots, y_m(t)]^T $,
- A hidden state vector $ \mathbf{h}_t = [h_1(t), \ldots, h_l(t)]^T $,

where the subscript t represents the time step.

RNNs have two recurrence forms. The general structure of a three-layer RNN with hidden state recurrence can be described by the following dynamical system:

### Hidden States for Hidden State Recurrence

$$
h_j(t) = f(\text{net}_j), \quad \text{for } j = 1, \ldots, l,
$$

$$
\text{net}_j = \sum_{i=1}^n U_{ji} x_i(t) + \sum_{\tilde{j}=1}^l W_{j\tilde{j}} h_{\tilde{j}}(t - 1) + b_j, \quad \text{for } j = 1, \ldots, l.
$$

Where:
- $\mathbf{U}_{hx} $ is the input-hidden weight matrix,
- $ \mathbf{W}_{hh} $ is the hidden-hidden weight matrix,
- $ \mathbf{V}_{yh} $ is the hidden-output weight matrix,
- $ \mathbf{b}_t $ is the bias vector for the hidden layers,
- $ \mathbf{c}_t $ is the bias vector for the output layers,
- f  is the activation function for the hidden layer,
- g  is the activation function for the output layer.

These equations capture the essence of how inputs are processed through the hidden states and how the recurrent connections influence the hidden states across time steps.


# General Structure of a Regular Unidirectional Three-Layer RNN with Output Recurrence

## Description
The general structure of a regular unidirectional three-layer Recurrent Neural Network (RNN) with output recurrence can be visualized as having a delay line \( z^{-1} \). This structure can be depicted in two forms:
1. **RNN with a delay line** (left side).
2. **RNN unfolded in time** for two time steps (right side).

### RNN with Delay Line
In this structure, the RNN processes input sequences and has recurrent connections that influence the current hidden state and output based on the previous time step.

### RNN Unfolded in Time
When unfolded in time, the RNN's operations can be seen across multiple time steps, showing how the hidden states and outputs at each time step depend on the previous states and outputs.

## Equations

### Hidden States for Hidden State Recurrence
The hidden state \( h_j(t) \) at time \( t \) is calculated as:

$$
h_j(t) = f(\text{net}_j), \quad \text{for } j = 1, \ldots, l,
$$

where

$$
\text{net}_j = \sum_{i=1}^n U_{ji} x_i(t) + \sum_{\tilde{j}=1}^l W_{j\tilde{j}} h_{\tilde{j}}(t - 1) + b_j, \quad \text{for } j = 1, \ldots, l.
$$

### Output Nodes for Output Recurrence
The output \( y_k(t) \) at time \( t \) is calculated as:

$$
y_k(t) = g(\text{net}_k), \quad \text{for } k = 1, \ldots, m,
$$

where

$$
\text{net}_k = \sum_{j=1}^l V_{kj} h_j(t) + \sum_{\tilde{k}=1}^m W'_{k\tilde{k}} y_{\tilde{k}}(t - 1) + c_k, \quad \text{for } k = 1, \ldots, m.
$$

## Matrices and Vectors
- $ \mathbf{x}_t \in \mathbb{R}^n $: Input vector at time t.
- $ \mathbf{h}_t \in \mathbb{R}^l $: Hidden state vector at time  t .
- $ \mathbf{y}_t \in \mathbb{R}^m $: Output vector at time t.
- $ \mathbf{U} \in \mathbb{R}^{l \times n} $: Input-hidden weight matrix.
- $ \mathbf{W} \in \mathbb{R}^{l \times l} $: Hidden-hidden weight matrix.
- $ \mathbf{V} \in \mathbb{R}^{m \times l} $: Hidden-output weight matrix.
- $ \mathbf{W'} \in \mathbb{R}^{m \times m} $: Output-output weight matrix (for output recurrence).
- $ \mathbf{b}_t \in \mathbb{R}^l $: Bias vector for hidden layers.
- $ \mathbf{c}_t \in \mathbb{R}^m $: Bias vector for output layers.
-  f : Activation function for the hidden layer.
-  g : Activation function for the output layer.

### Explanation
1. **Hidden State Recurrence**: The hidden state at each time step t depends on the current input, the previous hidden state, and a bias term.
2. **Output Recurrence**: The output at each time step t depends on the current hidden state, the previous output, and a bias term.

These equations illustrate the recursive nature of RNNs, where the network's current state and output are influenced by previous states and outputs, enabling the modeling of temporal dependencies in sequential data.


In [1]:
import math

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

def tanh(x):
    return math.tanh(x)

def rnn_forward(X, U, W, V, W_prime, b, c, timesteps):
    n = len(X[0])  # Input dimension
    l = len(U)     # Number of hidden units
    m = len(V)     # Output dimension
    
    # Initialize hidden states and outputs
    h = [[0] * l for _ in range(timesteps)]
    y = [[0] * m for _ in range(timesteps)]
    
    for t in range(timesteps):
        for j in range(l):
            net_j = 0
            for i in range(n):
                net_j += U[j][i] * X[t][i]
            for j_tilde in range(l):
                if t > 0:
                    net_j += W[j][j_tilde] * h[t-1][j_tilde]
            net_j += b[j]
            h[t][j] = tanh(net_j)
        
        for k in range(m):
            net_k = 0
            for j in range(l):
                net_k += V[k][j] * h[t][j]
            for k_tilde in range(m):
                if t > 0:
                    net_k += W_prime[k][k_tilde] * y[t-1][k_tilde]
            net_k += c[k]
            y[t][k] = sigmoid(net_k)
    
    return h, y

# Example usage
timesteps = 2
n = 3  # Input dimension
l = 2  # Number of hidden units
m = 2  # Output dimension

# Example input for 2 timesteps
X = [
    [0.1, 0.2, 0.3],
    [0.4, 0.5, 0.6]
]

# Initialize weights and biases
U = [
    [0.1, 0.2, 0.3],
    [0.4, 0.5, 0.6]
]

W = [
    [0.7, 0.8],
    [0.9, 1.0]
]

V = [
    [0.1, 0.2],
    [0.3, 0.4]
]

W_prime = [
    [0.5, 0.6],
    [0.7, 0.8]
]

b = [0.1, 0.2]
c = [0.3, 0.4]

# Perform forward pass
hidden_states, outputs = rnn_forward(X, U, W, V, W_prime, b, c, timesteps)

print("Hidden States:", hidden_states)
print("Outputs:", outputs)


Hidden States: [[0.23549574953849794, 0.47770001216849795], [0.7473861026001217, 0.9301695100217859]]
Outputs: [[0.6032653732179408, 0.6596486166222596], [0.7787151218361145, 0.8750413532290525]]


### Backpropagation Through Time (BPTT)

The design goal of an RNN is to calculate the gradient of the error with respect to parameters U , V, and W, and then learn good parameters using stochastic gradient descent (SGD). By mimicking the sum of the errors, the gradients at each time step are also summed up for one training example to get:

$$
\frac{\partial E}{\partial W} = \sum_{t=1}^{T} \frac{\partial E_t}{\partial W}
$$

$$
\frac{\partial E}{\partial V} = \sum_{t=1}^{T} \frac{\partial E_t}{\partial V}
$$

$$
\frac{\partial E}{\partial U} = \sum_{t=1}^{T} \frac{\partial E_t}{\partial U}
$$

The backpropagation through time (BPTT) algorithm [159] plays an important role in designing RNNs. The basic idea of this algorithm is to apply the chain rule of differentiation to calculate the above gradients backwards starting from the error.

1. Calculate the gradient $ \frac{\partial E_t}{\partial V_{kj}(t)} $:

$$
\frac{\partial E_t}{\partial V_{kj}(t)} = \frac{\partial E_t}{\partial \hat{y}_{pk}(t)} \cdot \frac{\partial \hat{y}_{pk}(t)}{\partial \text{net}_{pk}(t)} \cdot \frac{\partial \text{net}_{pk}(t)}{\partial V_{kj}(t)}
$$

$$
= - (y_{pk}(t) - \hat{y}_{pk}(t)) \cdot g'(\hat{y}_{pk}(t)) \cdot h_{pj}(t)
$$

$$
= -\delta_{pk}(t) \cdot h_{pj}(t)
$$

where $ \delta_{pk}(t) $is called the error for output nodes at time step \( t \) and is defined as:

$$
\delta_{pk}(t) = -\frac{\partial E_t}{\partial \hat{y}_{pk}(t)} \cdot \frac{\partial \hat{y}_{pk}(t)}{\partial \text{net}_{pk}(t)} = (y_{pk}(t) - \hat{y}_{pk}(t)) \cdot g'(\hat{y}_{pk}(t))
$$

2. Calculate the gradient $ \frac{\partial E_t}{\partial W_{jk}(t)} $:

$$
\frac{\partial E_t}{\partial W_{jk}(t)} = \sum_{k=1}^{m} -\delta_{pj}(t) \cdot \hat{y}_{pk}(t - 1)
$$

where $ \delta_{pj}(t) $ is known as the error for hidden nodes given by:

$$
\delta_{pj}(t) = \sum_{k=1}^{m} \delta_{pk} \cdot V_{kj} \cdot f'(h_{pj})
$$

3. Calculate the gradient $ \frac{\partial E_t}{\partial U_{ji}(t)} $:

$$
\frac{\partial E_t}{\partial U_{ji}(t)} = \sum_{i=1}^{n} -\delta_{pj}(t) \cdot x_{pi}(t)
$$

The above discussion can be summarized into the following formulae:

$$
\delta_{pk}(t) = (y_{pk}(t) - \hat{y}_{pk}(t)) \cdot g'(\hat{y}_{pk}(t))
$$

$$
\delta_{pj}(t) = \sum_{k=1}^{m} \delta_{pk}(t) \cdot V_{jk}(t) \cdot f'(h_{pj}(t))
$$

$$
V_{kj}(t + 1) = V_{kj}(t) + \Delta V_{kj}(t) = V_{kj}(t) + \eta \sum_{p=1}^{P} \delta_{pk} \cdot h_{pj}(t)
$$

$$
W_{jk}(t + 1) = W_{jk}(t) + \Delta W_{jk}(t) = W_{jk}(t) + \eta \sum_{p=1}^{P} \delta_{pj} \cdot \hat{y}_{pk}(t - 1)
$$

$$
U_{ji}(t + 1) = U_{ji}(t) + \Delta U_{ji}(t) = U_{ji}(t) + \eta \sum_{p=1}^{P} \delta_{pj} \cdot x_{pi}(t)
$$


In [4]:
import numpy as np

# Activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Derivative of activation function
def sigmoid_derivative(x):
    return x * (1 - x)

class RNN:
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        # Weight matrices
        self.U = np.random.randn(hidden_size, input_size)
        self.W = np.random.randn(hidden_size, hidden_size)
        self.V = np.random.randn(output_size, hidden_size)
        
        # Biases
        self.b_hidden = np.zeros((hidden_size, 1))
        self.b_output = np.zeros((output_size, 1))
        
    def forward(self, x):
        T = x.shape[1]
        h = np.zeros((self.hidden_size, T))
        y = np.zeros((self.output_size, T))
        
        for t in range(T):
            if t == 0:
                h[:, t] = sigmoid(np.dot(self.U, x[:, t].reshape(-1, 1)) + self.b_hidden)
            else:
                h[:, t] = sigmoid(np.dot(self.U, x[:, t].reshape(-1, 1)) + np.dot(self.W, h[:, t-1].reshape(-1, 1)) + self.b_hidden)
            
            y[:, t] = sigmoid(np.dot(self.V, h[:, t].reshape(-1, 1)) + self.b_output)
        
        return h, y
    
    def backward(self, x, y, h, lr):
        T = x.shape[1]
        dU = np.zeros_like(self.U)
        dW = np.zeros_like(self.W)
        dV = np.zeros_like(self.V)
        db_hidden = np.zeros_like(self.b_hidden)
        db_output = np.zeros_like(self.b_output)
        
        for t in range(T-1, -1, -1):
            delta_output = y[:, t] - x[:, t]
            dV += np.dot(delta_output, h[:, t].reshape(1, -1))
            db_output += delta_output.reshape(-1, 1)
            
            if t == 0:
                delta_hidden = np.dot(self.V.T, delta_output) * sigmoid_derivative(h[:, t])
            else:
                delta_hidden = np.dot(self.W.T, delta_hidden) * sigmoid_derivative(h[:, t])
                
            dU += np.dot(delta_hidden, x[:, t].reshape(1, -1))
            db_hidden += delta_hidden.reshape(-1, 1)
            
            if t > 0:
                dW += np.dot(delta_hidden, h[:, t-1].reshape(1, -1))
                
        # Update weights and biases
        self.U -= lr * dU
        self.W -= lr * dW
        self.V -= lr * dV
        self.b_hidden -= lr * db_hidden
        self.b_output -= lr * db_output

# Example usage
input_size = 100 # Assuming input matrix size is 100xT
hidden_size = 64
output_size = 100

# Create an RNN instance
rnn = RNN(input_size, hidden_size, output_size)

# Generate random input matrix (assuming it contains Sanskrit characters)
x = np.random.randn(input_size, 10) # Assuming 10 time steps

# Forward pass
h, y = rnn.forward(x)

# Backward pass (assuming y is the target output)
lr = 0.01
rnn.backward(x, y, h, lr)


ValueError: could not broadcast input array from shape (64,1) into shape (64,)

In [5]:
import numpy as np

# Define the Sanskrit to Odia mapping
sanskrit_to_odia = {
    'अ': 'ଅ',
    'आ': 'ଆ',
    'इ': 'ଇ',
    'ई': 'ଈ',
    'उ': 'ଉ',
    # Add more mappings as needed
}

# Define the RNN class
# class RNN:
    # Constructor and methods implementation ...

# Initialize the RNN model
input_size = 128  # Assuming ASCII representation
hidden_size = 64
output_size = 128  # Assuming ASCII representation
rnn = RNN(input_size, hidden_size, output_size)

# Prepare training data
def prepare_data(text, mapping):
    input_data = []
    target_data = []
    for char in text:
        # Convert Sanskrit character to ASCII code
        input_char = ord(char)
        input_data.append(input_char)
        
        # Convert Sanskrit character to Odia character
        if char in mapping:
            target_char = mapping[char]
        else:
            target_char = char
        # Convert Odia character to ASCII code
        target_data.append(ord(target_char))
    
    return np.array(input_data).reshape(-1, 1), np.array(target_data).reshape(-1, 1)

# Sample input text in Sanskrit
sanskrit_text = "अज जातः पर्णीरभ्यन्तरः शय्या अवलम्ब्य शयनम्।"

# Prepare training data
input_data, target_data = prepare_data(sanskrit_text, sanskrit_to_odia)

# Train the RNN model (assuming forward and backward methods are implemented)
lr = 0.01
num_epochs = 1000
for epoch in range(num_epochs):
    # Forward pass
    hidden_states, predictions = rnn.forward(input_data)
    
    # Backward pass
    rnn.backward(input_data, target_data, hidden_states, lr)

# Once trained, you can use the model to map Sanskrit characters to Odia characters
# by passing the input through the network's forward method


ValueError: shapes (64,128) and (44,1) not aligned: 128 (dim 1) != 44 (dim 0)