In [None]:
######################################################################################
'''Copyright (c) 2004- 2024 , Prof. Radhamadhab Dalai Odisha, India
Author's email address :  rmdi115@gmail.com'''
###################################################################################

###Softmax Regression and Softmax Function
### Softmax Function
The softmax function is used to convert a vector of raw scores (logits) into probabilities. For a vector \( z \) of length \( K \), the softmax function is defined as:

$
\sigma(z)_j = \frac{e^{z_j}}{\sum_{k=1}^{K} e^{z_k}}
$

where:
- $ \sigma(z)_j $ is the probability of the  j-th class.
- $ z_j $ is the j-th element of the input vector z.
- K is the number of classes.

### Cross-Entropy Loss for Multi-Class Classification
The cross-entropy loss for multi-class classification is defined as:

$
H(y, \hat{y}) = - \frac{1}{T} \sum_{t=1}^{T} \sum_{k=1}^{K} y_{t,k} \log(\hat{y}_{t,k})
$

where:
-  T is the total number of samples.
-  K is the number of classes.
-  $ y_{t,k} $ is a binary indicator (0 or 1) if class label k is the correct classification for sample t.
- $ \hat{y}_{t,k} $ is the predicted probability of class k for sample t.

### Gradient of the Cross-Entropy Loss Function
The gradient of the cross-entropy loss function with respect to the predicted probabilities $ \hat{y}_t $ is:

$
\frac{\partial H(y, \hat{y})}{\partial \hat{y}_t} = \frac{\hat{y}_t - y_t}{T}
$

The gradient with respect to the weights w is:

$
\nabla_w H(y, \hat{y}) = \frac{1}{T} \sum_{t=1}^{T} ( \hat{y}_t - y_t ) x_t
$

### Weight Update Rule
The weight update rule using gradient descent for softmax regression is:

$
w^{k+1} = w^k - \eta \nabla_w H(y, \hat{y})
$

where:
- $ \eta $ is the learning rate.
- $ w^k $ is the weight vector at iteration k.
- $ \nabla_w H(y, \hat{y}) $ is the gradient of the KL divergence (cross-entropy loss) with respect to the weights.




In [2]:
import numpy as np

# Softmax function
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  # stability improvement
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

# Cross-entropy loss function for softmax regression
def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Avoid log(0)
    return -np.mean(np.sum(y_true * np.log(y_pred), axis=1))

# Gradient of the cross-entropy loss function
def gradient_cross_entropy_loss(X, y_true, y_pred):
    return np.dot(X.T, (y_pred - y_true)) / X.shape[0]

# Softmax regression training function
def softmax_regression_train(X, y, learning_rate, epochs):
    n_samples, n_features = X.shape
    n_classes = y.shape[1]
    weights = np.zeros((n_features, n_classes))
    bias = np.zeros(n_classes)

    for _ in range(epochs):
        # Linear combination of features and weights
        linear_model = np.dot(X, weights) + bias

        # Predictions using softmax function
        y_pred = softmax(linear_model)

        # Compute the gradient of the loss function
        gradient = gradient_cross_entropy_loss(X, y, y_pred)
        bias_gradient = np.mean(y_pred - y, axis=0)

        # Update weights and bias using gradient descent
        weights -= learning_rate * gradient
        bias -= learning_rate * bias_gradient

    return weights, bias

# Softmax regression prediction function
def softmax_regression_predict(X, weights, bias):
    linear_model = np.dot(X, weights) + bias
    return softmax(linear_model)

# Example data (one-hot encoded labels)
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
y = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])

# Training
learning_rate = 0.01
epochs = 1000
weights, bias = softmax_regression_train(X, y, learning_rate, epochs)

# Prediction
y_pred = softmax_regression_predict(X, weights, bias)
print("Predictions:", y_pred)

# Loss
loss = cross_entropy_loss(y, y_pred)
print("Cross-entropy loss:", loss)


Predictions: [[0.68065166 0.18928148 0.13006686]
 [0.49600597 0.26347298 0.24052105]
 [0.30814999 0.31266358 0.37918643]
 [0.1649971  0.3197845  0.5152184 ]]
Cross-entropy loss: 0.727915949036652


### Softmax Regression

Given a set of training samples {($x_1$, $y_1$), $\ldots$, ($x_T$, $y_T$)}, where $x_t \in \mathbb{R}^n$ and $y_t \in \{1, \ldots, k\} $, we design a weight matrix W:

$
W = [w_1, w_2, \ldots, w_k]
$

### Softmax Function

The softmax function converts a vector of raw scores (logits) into probabilities. For a vector z of length k, the softmax function is defined as:

$
\sigma(z)_j = \frac{e^{z_j}}{\sum_{k=1}^{K} e^{z_k}}
$

where:
- $ \sigma(z)_j $ is the probability of the j-th class.
- $ z_j $ is the j-th element of the input vector z.
- k is the number of classes.

### Cross-Entropy Loss for Multi-Class Classification

The cross-entropy loss for multi-class classification is defined as:

$
H(y, \hat{y}) = - \frac{1}{T} \sum_{t=1}^{T} \sum_{i=1}^{k} y_{t,i} \log(\hat{y}_{t,i})
$

where:
- T  is the total number of samples.
- k  is the number of classes.
- y_{t,i}  is a binary indicator (0 or 1) if class label i is the correct classification for sample t .
- $\hat{y}_{t,i} $ is the predicted probability of class i for sample t.

### Gradient of the Cross-Entropy Loss Function

The gradient of the cross-entropy loss function with respect to the predicted probabilities $\hat{y}_t$ is:

$
\frac{\partial H(y, \hat{y})}{\partial \hat{y}_t} = \frac{\hat{y}_t - y_t}{T}
$

The gradient with respect to the weights W is:

$
\nabla_W H(y, \hat{y}) = \frac{1}{T} \sum_{t=1}^{T} X_t^T (\hat{y}_t - y_t)
$

### Weight Update Rule

The weight update rule using gradient descent for softmax regression is:

$
W^{k+1} = W^k - \eta \nabla_W H(y, \hat{y})
$

where:
- $ \eta $ is the learning rate.
- $W^k $ is the weight vector at iteration k.
- $\nabla_W H(y, \hat{y}) $ is the gradient of the KL divergence (cross-entropy loss) with respect to the weights.

### RNN Output

For an RNN, its output vector $g(Wx_t)$ is directly used as the hypothetical function. Letting $ z = Wx_t $, the hypothetical function is:

$
h_W(x_t) = g(Wx_t)
$

### Softmax Function for RNN Output

The softmax function for the vector z is:

$
\sigma(z)_j = \frac{e^{z_j}}{\sum_{i=1}^{k} e^{z_i}}
$

### Cross-Entropy Loss for RNN

The cross-entropy loss for a single training example in the context of RNNs is:

$
H(y_t, \hat{y}_t) = - \sum_{i=1}^{k} y_{t,i} \log(\hat{y}_{t,i})
$

### Weight Update Rule for RNN

The weight update rule using gradient descent for updating the weights W in an RNN is:

$
W^{k+1} = W^k - \eta \nabla_W H(y, \hat{y})
$

where:
- $\eta$  is the learning rate.
- $W^k $ is the weight matrix at iteration $ k $.
- $\nabla_W H(y, \hat{y}) $ is the gradient of the cross-entropy loss with respect to the weights.


In [3]:
import numpy as np

# Define the softmax function
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # stability improvement
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

# Define the cross-entropy loss function
def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Avoid log(0)
    return -np.mean(np.sum(y_true * np.log(y_pred), axis=1))

# Define the RNN cell
class SimpleRNN:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        
        # Initialize weights
        self.Wx = np.random.randn(input_size, hidden_size) * 0.01
        self.Wh = np.random.randn(hidden_size, hidden_size) * 0.01
        self.Wo = np.random.randn(hidden_size, output_size) * 0.01
        self.bh = np.zeros((1, hidden_size))
        self.bo = np.zeros((1, output_size))
        
        # Initialize hidden state
        self.h = np.zeros((1, hidden_size))
    
    def forward(self, x):
        self.h = np.tanh(np.dot(x, self.Wx) + np.dot(self.h, self.Wh) + self.bh)
        o = np.dot(self.h, self.Wo) + self.bo
        return softmax(o)
    
    def backward(self, x, y_true, y_pred):
        # Compute the gradient of the loss with respect to the output
        dL_dO = y_pred - y_true
        
        # Compute the gradients of Wo and bo
        dL_dWo = np.dot(self.h.T, dL_dO)
        dL_dbo = dL_dO
        
        # Backpropagation through time (for simplicity, one time step)
        dL_dh = np.dot(dL_dO, self.Wo.T)
        dL_dWh = np.dot(self.h.T, dL_dh * (1 - self.h ** 2))
        dL_dWx = np.dot(x.T, dL_dh * (1 - self.h ** 2))
        dL_dbh = dL_dh * (1 - self.h ** 2)
        
        # Update the weights and biases
        self.Wo -= self.learning_rate * dL_dWo
        self.bo -= self.learning_rate * np.sum(dL_dbo, axis=0, keepdims=True)
        self.Wh -= self.learning_rate * dL_dWh
        self.Wx -= self.learning_rate * dL_dWx
        self.bh -= self.learning_rate * np.sum(dL_dbh, axis=0, keepdims=True)

# Training the RNN
def train_rnn(rnn, X_train, y_train, epochs=1000):
    for epoch in range(epochs):
        loss = 0
        for x, y in zip(X_train, y_train):
            x = x.reshape(1, -1)
            y = y.reshape(1, -1)
            y_pred = rnn.forward(x)
            loss += cross_entropy_loss(y, y_pred)
            rnn.backward(x, y, y_pred)
        if (epoch + 1) % 100 == 0:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss/len(X_train)}')

# Example data
X_train = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
y_train = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])

# Initialize RNN
input_size = X_train.shape[1]
hidden_size = 5
output_size = y_train.shape[1]
learning_rate = 0.01

rnn = SimpleRNN(input_size, hidden_size, output_size, learning_rate)

# Train RNN
train_rnn(rnn, X_train, y_train, epochs=1000)

# Test the RNN
def predict_rnn(rnn, X):
    y_pred = []
    for x in X:
        x = x.reshape(1, -1)
        y_pred.append(rnn.forward(x))
    return np.array(y_pred)

y_pred = predict_rnn(rnn, X_train)
print("Predictions:", y_pred)


Epoch 100/1000, Loss: 1.0435256411205036
Epoch 200/1000, Loss: 0.9840876611465466
Epoch 300/1000, Loss: 0.869546581208506
Epoch 400/1000, Loss: 0.7182150784529611
Epoch 500/1000, Loss: 0.6425357646396047
Epoch 600/1000, Loss: 0.5854458052724199
Epoch 700/1000, Loss: 0.5113694386531344
Epoch 800/1000, Loss: 0.4230903768958066
Epoch 900/1000, Loss: 0.3280583317952764
Epoch 1000/1000, Loss: 0.20328721850364254
Predictions: [[[0.7384881  0.15151101 0.11000089]]

 [[0.925965   0.07198433 0.00205067]]

 [[0.19359036 0.75806083 0.04834881]]

 [[0.00288275 0.05147516 0.94564209]]]


In [4]:
import math

# Define the softmax function
def softmax(x):
    exp_x = [math.exp(i) for i in x]
    sum_exp_x = sum(exp_x)
    return [i / sum_exp_x for i in exp_x]

# Define the cross-entropy loss function
def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-15
    y_pred = [max(min(i, 1 - epsilon), epsilon) for i in y_pred]  # Avoid log(0)
    return -sum([y_true[i] * math.log(y_pred[i]) for i in range(len(y_true))])

# Matrix-vector multiplication
def mat_vec_mul(mat, vec):
    return [sum(mat[i][j] * vec[j] for j in range(len(vec))) for i in range(len(mat))]

# Vector addition
def vec_add(vec1, vec2):
    return [vec1[i] + vec2[i] for i in range(len(vec1))]

# Element-wise vector multiplication
def vec_mul(vec1, vec2):
    return [vec1[i] * vec2[i] for i in range(len(vec1))]

# Tanh activation function
def tanh(x):
    return [(math.exp(2 * i) - 1) / (math.exp(2 * i) + 1) for i in x]

# Derivative of tanh activation function
def tanh_derivative(x):
    return [1 - i**2 for i in x]

# Simple RNN cell
class SimpleRNN:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        
        # Initialize weights
        self.Wx = [[0.01 for _ in range(hidden_size)] for _ in range(input_size)]
        self.Wh = [[0.01 for _ in range(hidden_size)] for _ in range(hidden_size)]
        self.Wo = [[0.01 for _ in range(output_size)] for _ in range(hidden_size)]
        self.bh = [0.0 for _ in range(hidden_size)]
        self.bo = [0.0 for _ in range(output_size)]
        
        # Initialize hidden state
        self.h = [0.0 for _ in range(hidden_size)]
    
    def forward(self, x):
        self.h = tanh(vec_add(mat_vec_mul(self.Wx, x), vec_add(mat_vec_mul(self.Wh, self.h), self.bh)))
        o = vec_add(mat_vec_mul(self.Wo, self.h), self.bo)
        return softmax(o)
    
    def backward(self, x, y_true, y_pred):
        # Compute the gradient of the loss with respect to the output
        dL_dO = [y_pred[i] - y_true[i] for i in range(len(y_true))]
        
        # Compute the gradients of Wo and bo
        dL_dWo = [[self.h[i] * dL_dO[j] for j in range(len(dL_dO))] for i in range(len(self.h))]
        dL_dbo = dL_dO
        
        # Backpropagation through time (for simplicity, one time step)
        dL_dh = mat_vec_mul(self.Wo, dL_dO)
        dL_dh = vec_mul(dL_dh, tanh_derivative(self.h))
        
        dL_dWh = [[self.h[i] * dL_dh[j] for j in range(len(dL_dh))] for i in range(len(self.h))]
        dL_dWx = [[x[i] * dL_dh[j] for j in range(len(dL_dh))] for i in range(len(x))]
        dL_dbh = dL_dh
        
        # Update the weights and biases
        self.Wo = [[self.Wo[i][j] - self.learning_rate * dL_dWo[i][j] for j in range(len(dL_dWo[0]))] for i in range(len(self.Wo))]
        self.bo = [self.bo[i] - self.learning_rate * dL_dbo[i] for i in range(len(self.bo))]
        self.Wh = [[self.Wh[i][j] - self.learning_rate * dL_dWh[i][j] for j in range(len(dL_dWh[0]))] for i in range(len(self.Wh))]
        self.Wx = [[self.Wx[i][j] - self.learning_rate * dL_dWx[i][j] for j in range(len(dL_dWx[0]))] for i in range(len(self.Wx))]
        self.bh = [self.bh[i] - self.learning_rate * dL_dbh[i] for i in range(len(self.bh))]

# Training the RNN
def train_rnn(rnn, X_train, y_train, epochs=1000):
    for epoch in range(epochs):
        loss = 0
        for x, y in zip(X_train, y_train):
            y_pred = rnn.forward(x)
            loss += cross_entropy_loss(y, y_pred)
            rnn.backward(x, y, y_pred)
        if (epoch + 1) % 100 == 0:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss / len(X_train)}')

# Example data
X_train = [[1, 2], [2, 3], [3, 4], [4, 5]]
y_train = [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]]

# Initialize RNN
input_size = len(X_train[0])
hidden_size = 5
output_size = len(y_train[0])
learning_rate = 0.01

rnn = SimpleRNN(input_size, hidden_size, output_size, learning_rate)

# Train RNN
train_rnn(rnn, X_train, y_train, epochs=1000)

# Test the RNN
def predict_rnn(rnn, X):
    y_pred = []
    for x in X:
        y_pred.append(rnn.forward(x))
    return y_pred

y_pred = predict_rnn(rnn, X_train)
print("Predictions:", y_pred)


IndexError: list index out of range

In [5]:
import math

# Define the softmax function
def softmax(x):
    exp_x = [math.exp(i) for i in x]
    sum_exp_x = sum(exp_x)
    return [i / sum_exp_x for i in exp_x]

# Define the cross-entropy loss function
def cross_entropy_loss(y_true, y_pred):
    epsilon = 1e-15
    y_pred = [max(min(i, 1 - epsilon), epsilon) for i in y_pred]  # Avoid log(0)
    return -sum([y_true[i] * math.log(y_pred[i]) for i in range(len(y_true))])

# Matrix-vector multiplication
def mat_vec_mul(mat, vec):
    return [sum(mat[i][j] * vec[j] for j in range(len(vec))) for i in range(len(mat))]

# Vector addition
def vec_add(vec1, vec2):
    return [vec1[i] + vec2[i] for i in range(len(vec1))]

# Element-wise vector multiplication
def vec_mul(vec1, vec2):
    return [vec1[i] * vec2[i] for i in range(len(vec1))]

# Tanh activation function
def tanh(x):
    return [(math.exp(2 * i) - 1) / (math.exp(2 * i) + 1) for i in x]

# Derivative of tanh activation function
def tanh_derivative(x):
    return [1 - i**2 for i in x]

# Simple RNN cell
class SimpleRNN:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        
        # Initialize weights
        self.Wx = [[0.01 for _ in range(hidden_size)] for _ in range(input_size)]
        self.Wh = [[0.01 for _ in range(hidden_size)] for _ in range(hidden_size)]
        self.Wo = [[0.01 for _ in range(output_size)] for _ in range(hidden_size)]
        self.bh = [0.0 for _ in range(hidden_size)]
        self.bo = [0.0 for _ in range(output_size)]
        
        # Initialize hidden state
        self.h = [0.0 for _ in range(hidden_size)]
    
    def forward(self, x):
        self.h = tanh(vec_add(mat_vec_mul(self.Wx, x), vec_add(mat_vec_mul(self.Wh, self.h), self.bh)))
        o = vec_add(mat_vec_mul(self.Wo, self.h), self.bo)
        return softmax(o)
    
    def backward(self, x, y_true, y_pred):
        # Compute the gradient of the loss with respect to the output
        dL_dO = [y_pred[i] - y_true[i] for i in range(len(y_true))]
        
        # Compute the gradients of Wo and bo
        dL_dWo = [[self.h[i] * dL_dO[j] for j in range(len(dL_dO))] for i in range(len(self.h))]
        dL_dbo = dL_dO
        
        # Backpropagation through time (for simplicity, one time step)
        dL_dh = mat_vec_mul(self.Wo, dL_dO)
        dL_dh = vec_mul(dL_dh, tanh_derivative(self.h))
        
        dL_dWh = [[self.h[i] * dL_dh[j] for j in range(len(dL_dh))] for i in range(len(self.h))]
        dL_dWx = [[x[i] * dL_dh[j] for j in range(len(dL_dh))] for i in range(len(x))]
        dL_dbh = dL_dh
        
        # Update the weights and biases
        self.Wo = [[self.Wo[i][j] - self.learning_rate * dL_dWo[i][j] for j in range(len(dL_dWo[0]))] for i in range(len(self.Wo))]
        self.bo = [self.bo[i] - self.learning_rate * dL_dbo[i] for i in range(len(self.bo))]
        self.Wh = [[self.Wh[i][j] - self.learning_rate * dL_dWh[i][j] for j in range(len(dL_dWh[0]))] for i in range(len(self.Wh))]
        self.Wx = [[self.Wx[i][j] - self.learning_rate * dL_dWx[i][j] for j in range(len(dL_dWx[0]))] for i in range(len(self.Wx))]
        self.bh = [self.bh[i] - self.learning_rate * dL_dbh[i] for i in range(len(self.bh))]

# Training the RNN
def train_rnn(rnn, X_train, y_train, epochs=1000):
    for epoch in range(epochs):
        loss = 0
        for x, y in zip(X_train, y_train):
            y_pred = rnn.forward(x)
            loss += cross_entropy_loss(y, y_pred)
            rnn.backward(x, y, y_pred)
        if (epoch + 1) % 100 == 0:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss / len(X_train)}')

# Example data
X_train = [[1, 2], [2, 3], [3, 4], [4, 5]]
y_train = [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]]

# Initialize RNN
input_size = len(X_train[0])
hidden_size = 5
output_size = len(y_train[0])
learning_rate = 0.01

rnn = SimpleRNN(input_size, hidden_size, output_size, learning_rate)

# Train RNN
train_rnn(rnn, X_train, y_train, epochs=1000)

# Test the RNN
def predict_rnn(rnn, X):
    y_pred = []
    for x in X:
        y_pred.append(rnn.forward(x))
    return y_pred

y_pred = predict_rnn(rnn, X_train)
print("Predictions:", y_pred)


IndexError: list index out of range