In [None]:
'''
 * Copyright (c) 2018 Radhamadhab Dalai
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
'''

Connectionist Temporal Classification (CTC)
CTC addresses the challenges of labeling unsegmented sequence data directly with RNNs. Here's a brief overview:

Given a set S of training examples where each example consists of a pair of sequences (x, z) with  x = ($x_1$, $\ldots$, $x_T$) and  z = ($z_1$, $\ldots$, $z_U$), where  U $\leq$ T , the sequences x and z are not aligned initially due to their differing lengths.

CTC Method:
Softmax Layer: A softmax layer is used to define an output distribution
𝑃(𝑘|𝑡)
at each time step t along the input sequence x. This distribution covers K phonemes plus an extra blank symbol  $\emptyset$, making the softmax layer size ( K + 1 ).

Bidirectional RNNs: Typically, RNNs trained with CTC are bidirectional to ensure each
𝑃(𝑘|𝑡)
depends on the entire input sequence x.

Output Vector: The unnormalized output vector
𝑦𝑡
at time step  t  is defined as:
𝑦𝑡=𝑊→𝑁ℎ𝑡→𝑁+𝑊←𝑁ℎ𝑡←𝑁+𝑏𝑦
where:

$ W_{\rightarrow N} $ and $ W_{\leftarrow N} $ are weight matrices from forward and backward hidden states to the output gate.
( h_{\rightarrow N}^t ) and ( h_{\leftarrow N}^t ) are forward and backward hidden states at time step ( t ).
( b_y ) is the bias.
Output Probability
𝑃(𝑘|𝑡)
:
𝑃(𝑘|𝑡)=exp(𝑦𝑡[𝑘])∑𝐾+1𝑘′=1exp(𝑦𝑡[𝑘′])
where ( y_t[k] ) is the ( k )-th element of ( y_t ).

S-LSTM Network
The S-LSTM network extends LSTM to handle longer-term dependencies and more complex input structures:

S-LSTM Memory Block:
Each S-LSTM memory block contains:

One input gate
𝑖𝑡
One output gate
𝑜𝑡
Multiple forget gates depending on the number of children of a node
Forward Computation:
Input Gate
𝑖𝑡
:
𝑖𝑡=𝜎(𝑊𝑖𝐿ℎℎ𝐿𝑡−1+𝑊𝑖𝑅ℎℎ𝑅𝑡−1+𝑊𝑖𝐿𝑐𝑐𝐿𝑡−1+𝑊𝑖𝑅𝑐𝑐𝑅𝑡−1+𝑏𝑖)
where:
( \sigma ) is the logistic sigmoid function.
( h_{Lt-1}, h_{Rt-1} ) are hidden vectors of the left and right children.
( c_{Lt-1}, c_{Rt-1} ) are cell vectors of the left and right children.
( W ) and ( b ) are weight matrices and biases.
These networks enhance the capabilities of traditional RNNs and LSTMs by addressing alignment issues and handling more complex input structures efficiently.

In [1]:
import numpy as np

class CTC:
    def __init__(self, input_size, num_classes):
        self.input_size = input_size
        self.num_classes = num_classes
        self.softmax = np.zeros((num_classes + 1, input_size))  # Including blank symbol

    def forward(self, x):
        # Assuming x is the input sequence tensor of shape (input_size, sequence_length)
        sequence_length = x.shape[1]
        outputs = np.zeros((self.num_classes + 1, sequence_length))

        for t in range(sequence_length):
            # Example softmax computation (replace with actual logits)
            logits_t = np.random.rand(self.num_classes + 1)
            outputs[:, t] = np.exp(logits_t) / np.sum(np.exp(logits_t))

        return outputs

# Example usage:
input_size = 10
num_classes = 5
ctc = CTC(input_size, num_classes)

# Example input sequence (10-dimensional input, length 20)
input_sequence = np.random.rand(input_size, 20)

# Forward pass
output_sequence = ctc.forward(input_sequence)
print(output_sequence.shape)  # Example output shape

(6, 20)


In [2]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class S_LSTM_Cell:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.weights = {
            'W_lh': np.random.randn(hidden_size, hidden_size),
            'W_rh': np.random.randn(hidden_size, hidden_size),
            'W_lc': np.random.randn(hidden_size, hidden_size),
            'W_rc': np.random.randn(hidden_size, hidden_size),
            'W_i': np.random.randn(hidden_size, input_size)
        }
        self.biases = {
            'b_i': np.zeros((hidden_size, 1))
        }

    def forward(self, h_l_prev, h_r_prev, c_l_prev, c_r_prev, x):
        i_t = sigmoid(
            np.dot(self.weights['W_lh'], h_l_prev) +
            np.dot(self.weights['W_rh'], h_r_prev) +
            np.dot(self.weights['W_lc'], c_l_prev) +
            np.dot(self.weights['W_rc'], c_r_prev) +
            np.dot(self.weights['W_i'], x) +
            self.biases['b_i']
        )
        return i_t

# Example usage:
input_size = 5  # Example input size
hidden_size = 10  # Example hidden size
s_lstm_cell = S_LSTM_Cell(input_size, hidden_size)

# Example inputs (previous hidden and cell states)
h_l_prev = np.random.randn(hidden_size, 1)  # Shape (10, 1)
h_r_prev = np.random.randn(hidden_size, 1)  # Shape (10, 1)
c_l_prev = np.random.randn(hidden_size, 1)  # Shape (10, 1)
c_r_prev = np.random.randn(hidden_size, 1)  # Shape (10, 1)
x = np.random.randn(input_size, 1)  # Shape (5, 1)

# Forward pass through S-LSTM cell
i_t = s_lstm_cell.forward(h_l_prev, h_r_prev, c_l_prev, c_r_prev, x)
print(i_t.shape)  # Example output shape


(10, 1)


Each S-LSTM memory block contains one input gate and one output gate, but different from LSTM, S-LSTM has two or more forget gates. The number of forget gates depends on the number of children of a node. For two children, their hidden vectors are denoted as \( h^L_{t-1} \) for the left child and \( h^R_{t-1} \) for the right child. These hidden vectors are taken in as the inputs of the current block.

The forward computation of a S-LSTM memory block is specified as follows:

1. **Input gate**: The input gate \( i_t \) contains four resources of information: the hidden vectors \( h^L_{t-1} \) and \( h^R_{t-1} \) and cell vectors \( c^L_{t-1} \) and \( c^R_{t-1} \) of its two children, i.e.,

$$
i_t = \sigma \left( W^L_{hi} h^L_{t-1} + W^R_{hi} h^R_{t-1} + W^L_{ci} c^L_{t-1} + W^R_{ci} c^R_{t-1} + b_i \right)
$$

where \( \sigma \) is the element-wise logistic function used to confine the gating signals to be in the range of [0, 1].


In [3]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

class S_LSTM_Cell:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.weights = {
            'W_lh': np.random.randn(hidden_size, hidden_size),
            'W_rh': np.random.randn(hidden_size, hidden_size),
            'W_lc': np.random.randn(hidden_size, hidden_size),
            'W_rc': np.random.randn(hidden_size, hidden_size),
            'W_i': np.random.randn(hidden_size, input_size)
        }
        self.biases = {
            'b_i': np.zeros((hidden_size, 1))
        }

    def forward(self, h_l_prev, h_r_prev, c_l_prev, c_r_prev, x):
        # Compute the input gate
        i_t = sigmoid(
            np.dot(self.weights['W_lh'], h_l_prev) +
            np.dot(self.weights['W_rh'], h_r_prev) +
            np.dot(self.weights['W_lc'], c_l_prev) +
            np.dot(self.weights['W_rc'], c_r_prev) +
            np.dot(self.weights['W_i'], x) +
            self.biases['b_i']
        )
        return i_t

    def backward(self, grad_output):
        # Placeholder for backward pass (gradient computation)
        pass

    def update_weights(self, lr):
        # Placeholder for weight update
        pass

# Example usage:
input_size = 5  # Example input size
hidden_size = 10  # Example hidden size
s_lstm_cell = S_LSTM_Cell(input_size, hidden_size)

# Generate synthetic dataset
num_samples = 100
X = np.random.randn(num_samples, input_size, 1)  # Inputs
y = np.random.randn(num_samples, hidden_size, 1)  # Targets

# Training parameters
num_epochs = 1000
learning_rate = 0.01

# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    for i in range(num_samples):
        # Example inputs (previous hidden and cell states)
        h_l_prev = np.random.randn(hidden_size, 1)  # Shape (10, 1)
        h_r_prev = np.random.randn(hidden_size, 1)  # Shape (10, 1)
        c_l_prev = np.random.randn(hidden_size, 1)  # Shape (10, 1)
        c_r_prev = np.random.randn(hidden_size, 1)  # Shape (10, 1)
        x = X[i]  # Current input

        # Forward pass through S-LSTM cell
        i_t = s_lstm_cell.forward(h_l_prev, h_r_prev, c_l_prev, c_r_prev, x)

        # Compute loss (Mean Squared Error)
        loss = np.mean((i_t - y[i]) ** 2)
        total_loss += loss

        # Backward pass (compute gradients)
        grad_output = 2 * (i_t - y[i]) / y[i].size
        s_lstm_cell.backward(grad_output)

        # Update weights
        s_lstm_cell.update_weights(learning_rate)

    if epoch % 100 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss / num_samples}')

print('Training complete')


Epoch 0, Loss: 1.43207905678063
Epoch 100, Loss: 1.388094958975622
Epoch 200, Loss: 1.3739687215329668
Epoch 300, Loss: 1.4106310090744245
Epoch 400, Loss: 1.3708422935239144
Epoch 500, Loss: 1.379349831149973
Epoch 600, Loss: 1.3817452737316587
Epoch 700, Loss: 1.3979765672000388
Epoch 800, Loss: 1.3964823016505372
Epoch 900, Loss: 1.3550234442664653
Training complete


2. **Forget gate**: The above four sources of information are also used to form the gating signals for the left forget gate \( f^L_{t-1} \) and right forget gate \( f^R_{t-1} \) via different weight matrices:

$$
f^L_t = \sigma \left( W^L_{hfl} h^L_{t-1} + W^R_{hfl} h^R_{t-1} + W^L_{cfl} c^L_{t-1} + W^R_{cfl} c^R_{t-1} + b_{fl} \right)
$$

$$
f^R_t = \sigma \left( W^L_{hfr} h^L_{t-1} + W^R_{hfr} h^R_{t-1} + W^L_{cfr} c^L_{t-1} + W^R_{cfr} c^R_{t-1} + b_{fr} \right)
$$

3. **Cell gate**: The cell here considers the copies from both children’s cell vectors \( (c^L_{t-1}, c^R_{t-1}) \), gated with separated forget gates. The left and right forget gates can be controlled independently, allowing the pass-through of information from children’s cell vectors:

$$
x_t = W^L_{hx} h^L_{t-1} + W^R_{hx} h^R_{t-1} + b_x
$$

$$
c_t = f^L_t \odot c^L_{t-1} + f^R_t \odot c^R_{t-1} + i_t \odot \tanh(x_t)
$$

4. **Output gate**: The output gate \( o_t \) considers the hidden vectors from the children and the current cell vector:

$$
o_t = \sigma \left( W^L_{ho} h^L_{t-1} + W^R_{ho} h^R_{t-1} + W_{co} c_t + b_o \right)
$$

5. **Hidden state**: The hidden vector \( h_t \) and the cell vector \( c_t \) of the current block are passed to the parent and are used depending on if the current block is a left or right child of its parent:

$$
h_t = o_t \odot \tanh(c_t)
$$

The backward computation of a S-LSTM memory block uses backpropagation over structures:

$$
\frac{\partial o_t}{\partial h_t} = \frac{\partial h_t}{\partial o_t}
$$

$$
\frac{\partial x_t}{\partial o_t} = \frac{\partial h_t}{\partial o_t} \odot \tanh(c_t) \sigma' (o_t)
$$

$$
\frac{\partial f^L_t}{\partial x_t} = \frac{\partial c_t}{\partial x_t} \odot c^L_{t-1} \sigma' (f^L_t)
$$


In [4]:
import numpy as np

class S_LSTM_Cell:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.weights = {
            'W_lh': np.random.randn(hidden_size, hidden_size),
            'W_rh': np.random.randn(hidden_size, hidden_size),
            'W_lc': np.random.randn(hidden_size, hidden_size),
            'W_rc': np.random.randn(hidden_size, hidden_size),
            'W_hi': np.random.randn(hidden_size, hidden_size * 2),  # for h_l and h_r concatenated
            'W_ci': np.random.randn(hidden_size, hidden_size * 2),  # for c_l and c_r concatenated
            'W_ho': np.random.randn(hidden_size, hidden_size * 2),  # for h_l and h_r concatenated
            'W_co': np.random.randn(hidden_size, hidden_size),
            'W_hfl': np.random.randn(hidden_size, hidden_size * 2),  # for h_l and h_r concatenated
            'W_cfl': np.random.randn(hidden_size, hidden_size * 2),  # for c_l and c_r concatenated
            'W_hfr': np.random.randn(hidden_size, hidden_size * 2),  # for h_l and h_r concatenated
            'W_cfr': np.random.randn(hidden_size, hidden_size * 2)   # for c_l and c_r concatenated
        }
        
        self.biases = {
            'b_i': np.zeros((hidden_size, 1)),
            'b_o': np.zeros((hidden_size, 1)),
            'b_fl': np.zeros((hidden_size, 1)),
            'b_fr': np.zeros((hidden_size, 1))
        }

    def forward(self, h_l_prev, h_r_prev, c_l_prev, c_r_prev):
        # Concatenate left and right hidden states and cell states
        h_concat = np.concatenate((h_l_prev, h_r_prev), axis=0)
        c_concat = np.concatenate((c_l_prev, c_r_prev), axis=0)
        
        # Input gate
        i_t = sigmoid(
            np.dot(self.weights['W_hi'], h_concat) +
            np.dot(self.weights['W_ci'], c_concat) +
            self.biases['b_i']
        )
        
        # Forget gates
        f_l_t = sigmoid(
            np.dot(self.weights['W_hfl'], h_concat) +
            np.dot(self.weights['W_cfl'], c_concat) +
            self.biases['b_fl']
        )
        
        f_r_t = sigmoid(
            np.dot(self.weights['W_hfr'], h_concat) +
            np.dot(self.weights['W_cfr'], c_concat) +
            self.biases['b_fr']
        )
        
        # Cell gate
        x_t = np.dot(self.weights['W_lh'], h_l_prev) + np.dot(self.weights['W_rh'], h_r_prev)
        c_t = f_l_t * c_l_prev + f_r_t * c_r_prev + i_t * np.tanh(x_t)
        
        # Output gate
        o_t = sigmoid(
            np.dot(self.weights['W_ho'], h_concat) +
            np.dot(self.weights['W_co'], c_t) +
            self.biases['b_o']
        )
        
        # Hidden state
        h_t = o_t * np.tanh(c_t)
        
        return h_t, c_t

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Example usage:
input_size = 5  # Example input size
hidden_size = 10  # Example hidden size

s_lstm_cell = S_LSTM_Cell(input_size, hidden_size)

# Example inputs (previous hidden and cell states)
h_l_prev = np.random.randn(hidden_size, 1)
h_r_prev = np.random.randn(hidden_size, 1)
c_l_prev = np.random.randn(hidden_size, 1)
c_r_prev = np.random.randn(hidden_size, 1)

# Forward pass through S-LSTM cell
h_t, c_t = s_lstm_cell.forward(h_l_prev, h_r_prev, c_l_prev, c_r_prev)
print("Hidden state:", h_t)
print("Cell state:", c_t)


Hidden state: [[ 7.32646942e-02]
 [ 2.93306077e-01]
 [-1.18412585e-02]
 [ 1.91031134e-01]
 [-1.36843695e-03]
 [-8.30990722e-01]
 [ 6.37431207e-05]
 [ 1.47682563e-02]
 [-9.44697639e-01]
 [ 1.52057216e-02]]
Cell state: [[ 0.52161983]
 [ 0.76274031]
 [-0.01184249]
 [ 2.66960687]
 [-0.29943821]
 [-1.19132949]
 [ 1.36305189]
 [ 0.44871409]
 [-3.04636773]
 [ 1.40694624]]
