In [None]:
'''
 * Copyright (c) 2018 Radhamadhab Dalai
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
'''

# Extreme Learning Machine (ELM)

Extreme learning machine (ELM) is a machine learning algorithm specifically designed for single-hidden-layer feedforward neural networks (SLFNs). This algorithm is notable for its ability to randomly select hidden nodes and analytically determine the output weights of SLFNs, resulting in fast training speeds and good generalization performance. Here, we'll dive into the key concepts and implementation of ELM.

## Key Concepts of Extreme Learning Machine (ELM)

### Single-Hidden-Layer Feedforward Neural Networks (SLFNs)
SLFNs are a type of neural network architecture with one hidden layer between the input and output layers. The hidden layer transforms the input data through nonlinear activation functions, enabling the network to learn complex patterns.

### Random Hidden Nodes
In ELM, the hidden nodes' parameters (weights and biases) are randomly generated and fixed. This randomization simplifies the learning process by reducing it to a linear problem, which can be solved analytically.

### Analytical Determination of Output Weights
The output weights are computed using a closed-form solution. Given the fixed hidden node parameters, the output weights can be determined by minimizing the error between the network's predictions and the actual target values using a least-squares solution.

## Mathematical Formulation

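Given a training set \( \{(x_i, y_i) \mid x_i \in \mathbb{R}^n, y_i \in \mathbb{R}^m, i = 1, \ldots, N\} \), where \( x_i \) is the input and \( y_i \) is the target output, an SLFN with \( N \) hidden nodes (for simplicity, the number of hidden nodes is taken equal to the number of samples here; in general it may be smaller, in which case the matrix \( H \) below is rectangular rather than square) can be represented as: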

$$
h(x) = \sum_{i=1}^N \beta_i g(w_i \cdot x + b_i)
$$

where \( \beta_i \) is the output weight, \( w_i \) and \( b_i \) are the randomly generated parameters of the hidden node, and \( g(\cdot) \) is the activation function.

The objective is to find the output weights \( \beta \) that minimize the error:

$$
\min_{\beta} \| H \beta - Y \|
$$

where \( H \) is the hidden layer output matrix, and \( Y \) is the target matrix. The matrix \( H \) is constructed as:

$$
H = 
\begin{bmatrix}
g(w_1 \cdot x_1 + b_1) & \cdots & g(w_N \cdot x_1 + b_N) \\
\vdots & \ddots & \vdots \\
g(w_1 \cdot x_N + b_1) & \cdots & g(w_N \cdot x_N + b_N)
\end{bmatrix}
$$

The output weights \( \beta \) can be computed analytically using the Moore-Penrose generalized inverse of \( H \):

$$
\beta = H^\dagger Y
$$

## Implementation in Python

Here is a simple implementation of the Extreme Learning Machine (ELM) in Python:


In [1]:
import numpy as np

class ExtremeLearningMachine:
    """Single-hidden-layer feedforward network trained as an ELM.

    The hidden-layer weights ``W`` and biases ``b`` are drawn once from a
    standard normal distribution in ``__init__`` and are never updated.
    ``fit`` only solves a linear least-squares problem for the output
    weights ``beta`` using the Moore-Penrose pseudo-inverse of the
    hidden-layer output matrix.
    """

    def __init__(self, input_dim, hidden_dim, activation='sigmoid'):
        """
        Initialize the Extreme Learning Machine.

        Parameters:
        - input_dim: The dimension of the input features.
        - hidden_dim: The number of hidden neurons.
        - activation: The activation function to use ('sigmoid', 'tanh', 'relu').
          The name is validated lazily, the first time the activation is applied.
        """
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.activation = activation

        # Randomly initialize weights and biases (W first, then b, so the
        # sequence of draws from NumPy's global RNG is well defined).
        self.W = np.random.randn(hidden_dim, input_dim)
        self.b = np.random.randn(hidden_dim, 1)

        # Output weights; stays None until fit() has been called so that
        # predict() can report a clear error on an untrained model.
        self.beta = None

    def _activation(self, X):
        """
        Apply the configured activation function element-wise.

        Parameters:
        - X: The pre-activation values.

        Returns:
        - Activated output, same shape as X.

        Raises:
        - ValueError: if ``self.activation`` is not 'sigmoid', 'tanh' or 'relu'.
        """
        if self.activation == 'sigmoid':
            # sigmoid(x) = exp(-log(1 + exp(-x))).  logaddexp evaluates
            # log(1 + exp(-x)) without overflowing, unlike the naive
            # 1 / (1 + exp(-x)) which overflows for large negative x.
            return np.exp(-np.logaddexp(0, -X))
        elif self.activation == 'tanh':
            return np.tanh(X)
        elif self.activation == 'relu':
            return np.maximum(0, X)
        else:
            raise ValueError("Unsupported activation function")

    def _hidden_output(self, X):
        """Return the hidden layer output matrix H for the inputs X."""
        # self.b has shape (hidden_dim, 1); its transpose broadcasts over rows.
        return self._activation(np.dot(X, self.W.T) + self.b.T)

    def fit(self, X, Y):
        """
        Train the ELM model.

        Only the output weights are learned; W and b stay fixed.

        Parameters:
        - X: The input features (shape: [num_samples, input_dim]).
        - Y: The target values (shape: [num_samples, output_dim]).
        """
        # Compute hidden layer output matrix H
        H = self._hidden_output(X)

        # Least-squares output weights via the Moore-Penrose generalized inverse
        self.beta = np.dot(np.linalg.pinv(H), Y)

    def predict(self, X):
        """
        Predict the output for given input data.

        Parameters:
        - X: The input features (shape: [num_samples, input_dim]).

        Returns:
        - Predicted output values.

        Raises:
        - RuntimeError: if the model has not been fitted yet.
        """
        if getattr(self, 'beta', None) is None:
            raise RuntimeError(
                "ExtremeLearningMachine is not fitted yet; call fit() before predict()."
            )
        return np.dot(self._hidden_output(X), self.beta)

# Demonstration: fit an ELM on a synthetic regression task and print predictions
if __name__ == "__main__":
    # Seed NumPy's global RNG so the data and the random hidden layer repeat
    np.random.seed(0)

    # Problem dimensions
    input_dim, hidden_dim, output_dim = 10, 20, 1
    num_samples = 100

    # Training set: uniform inputs, target is the sine of each row's sum
    X_train = np.random.rand(num_samples, input_dim)
    Y_train = np.sin(X_train.sum(axis=1, keepdims=True))

    # Build the model (draws W and b here) and solve for the output weights
    elm = ExtremeLearningMachine(input_dim, hidden_dim, activation='relu')
    elm.fit(X_train, Y_train)

    # Evaluate on ten fresh uniform samples
    X_test = np.random.rand(10, input_dim)
    Y_pred = elm.predict(X_test)

    print("Predictions:\n", Y_pred)


Predictions:
 [[-0.95906357]
 [ 0.10913444]
 [ 0.34502531]
 [-0.63861083]
 [-0.61727189]
 [-0.75399938]
 [-0.02450225]
 [-0.6646245 ]
 [-0.72546154]
 [-0.5249342 ]]


### Random Hidden Nodes and SLFNs

**Definition 7.7 (Piecewise Continuous):** A function is said to be piecewise continuous if it has only a finite number of discontinuities in any interval and its left and right limits are defined (not necessarily equal) at each discontinuity.

**Definition 7.8 (Randomly Generated):** The function sequence \(\{g_n = g(w_n^T x + b_n)\}\) or \(\{g_n = g(\|w_n - x\| / b_n)\}\) is said to be randomly generated if the corresponding parameters are randomly generated from, or based on, a continuous probability distribution.

**Definition 7.9 (Random Node):** A node is called a random node if its parameters \((w, b)\) are randomly generated based on a continuous probability distribution.

### SLFN Architectures

Single-hidden-layer feedforward networks (SLFNs) have two main network architectures:
- SLFNs with additive hidden nodes
- Radial basis function (RBF) networks, i.e., SLFNs that use RBF nodes in the hidden layer.

The network function of an SLFN with \(d\) hidden nodes can be represented by:

$$
f_d(x) = \sum_{i=1}^{d} \beta_i g_i(x), \quad x \in \mathbb{R}^n, \; \beta_i \in \mathbb{R}
$$

where \(g_i\) denotes the \(i\)-th hidden node output function.

Two commonly used \(i\)-th hidden node output functions \(g_i\) are defined as:

1. For additive nodes:
   $$
   g_i(x) = g(w_i^T x + b_i), \quad w_i \in \mathbb{R}^n, \; b_i \in \mathbb{R}
   $$

2. For RBF nodes:
   $$
   g_i(x) = g\left(\frac{\|x - a_i\|}{b_i}\right), \quad a_i \in \mathbb{R}^n, \; b_i \in \mathbb{R}^+
   $$

where \(a_i\) and \(b_i\) are the center and impact factor of the \(i\)-th RBF node, respectively, and \(\mathbb{R}^+\) denotes the set of all positive real numbers.

In other words, the output functions of an SLFN with \(d\) additive nodes and of an SLFN with \(d\) RBF nodes are, respectively, given by:

1. For additive nodes:
   $$
   f_d(x) = \sum_{i=1}^{d} \beta_i g\left(w_i^T x + b_i\right) \in \mathbb{R}
   $$

2. For RBF nodes:
   $$
   f_d(x) = \sum_{i=1}^{d} \beta_i g\left(\frac{\|x - a_i\|}{b_i}\right) \in \mathbb{R}
   $$

### Theorems

**Theorem 7.2 ([70]):** Suppose we are given a standard SLFN with \(N\) hidden nodes and activation function \(g: \mathbb{R} \to \mathbb{R}\) which is infinitely differentiable in any interval. For \(N\) arbitrary distinct samples \((x_i, y_i)\), \(i = 1, \ldots, N\), where \(x_i \in \mathbb{R}^n\) and \(y_i \in \mathbb{R}^m\), then for any \(w_i\) and \(b_i\) randomly chosen, respectively, from any intervals of \(\mathbb{R}^n\) and \(\mathbb{R}\) according to any continuous probability distribution, the hidden layer output matrix \(H\) of the SLFN, with probability one, is invertible and \(\|H\beta - Y\| = 0\).

**Theorem 7.3 ([70]):** Given any small positive value \(\epsilon > 0\) and an activation function \(g: \mathbb{R} \to \mathbb{R}\) which is infinitely differentiable in any interval, there exists \(d \leq N\) such that for \(N\) arbitrary distinct samples \((x_i, y_i)\), \(i = 1, \ldots, N\), where \(x_i \in \mathbb{R}^n\) and \(y_i \in \mathbb{R}^m\), and for any \(w_i\) and \(b_i\) randomly chosen, respectively, from any intervals of \(\mathbb{R}^n\) and \(\mathbb{R}\) according to any continuous probability distribution, with probability one \(\|H_{N \times d} \beta_{d \times m} - Y_{N \times m}\| < \epsilon\).

Infinitely differentiable activation functions include the sigmoidal functions as well as the radial basis, sine, cosine, exponential, and many other nonregular functions, as shown by Huang and Babri [69].


In [2]:
import numpy as np

class SLFNAdditive:
    """SLFN with additive sigmoid hidden nodes and least-squares output weights.

    ``fit`` draws the hidden weights ``W`` and biases ``b`` from a standard
    normal distribution, then computes the output weights ``beta`` with the
    Moore-Penrose pseudo-inverse of the hidden layer output matrix.
    """

    def __init__(self, input_dim, hidden_nodes):
        """
        Store the network dimensions; no parameters are drawn until fit().

        Parameters:
        - input_dim: The dimension of the input features.
        - hidden_nodes: The number of hidden nodes.
        """
        self.input_dim = input_dim
        self.hidden_nodes = hidden_nodes
        # All three stay None until fit() has been called.
        self.beta = None
        self.W = None
        self.b = None

    def _activation_function(self, x):
        """Return the element-wise sigmoid of x."""
        # sigmoid(x) = exp(-log(1 + exp(-x))).  logaddexp evaluates
        # log(1 + exp(-x)) without overflowing, unlike the naive
        # 1 / (1 + exp(-x)) which overflows for large negative x.
        return np.exp(-np.logaddexp(0, -x))

    def _hidden_output(self, X):
        """Return the hidden layer output matrix H for the inputs X."""
        return self._activation_function(X @ self.W.T + self.b)

    def fit(self, X, y):
        """
        Draw random hidden parameters and solve for the output weights.

        Parameters:
        - X: The input features (shape: [num_samples, input_dim]).
        - y: The target values, one row per sample.
        """
        # Randomly initialize weights and biases (W first, then b, so the
        # sequence of draws from NumPy's global RNG is well defined).
        self.W = np.random.randn(self.hidden_nodes, self.input_dim)
        self.b = np.random.randn(self.hidden_nodes)

        # Compute hidden layer output matrix H
        H = self._hidden_output(X)

        # Least-squares output weights via the pseudo-inverse of H
        self.beta = np.linalg.pinv(H) @ y

    def predict(self, X):
        """
        Predict the output for given input data.

        Parameters:
        - X: The input features (shape: [num_samples, input_dim]).

        Returns:
        - Predicted output values.

        Raises:
        - RuntimeError: if the model has not been fitted yet.
        """
        if self.W is None or self.b is None or self.beta is None:
            raise RuntimeError(
                "SLFNAdditive is not fitted yet; call fit() before predict()."
            )
        return self._hidden_output(X) @ self.beta

# Demonstration: fit the additive-node SLFN to random data and print its fit
if __name__ == "__main__":
    # Seed NumPy's global RNG, then draw 100 samples of 5 features and 1 target
    np.random.seed(0)
    X, y = np.random.rand(100, 5), np.random.rand(100, 1)

    # Ten sigmoid hidden nodes; fit() draws the hidden parameters itself
    model = SLFNAdditive(5, 10)
    model.fit(X, y)

    # Predictions on the training inputs
    predictions = model.predict(X)
    print(predictions)


[[0.3320863 ]
 [0.56467043]
 [0.61434423]
 [0.86048069]
 [0.62249718]
 [0.43570646]
 [0.45177932]
 [0.54242468]
 [0.49766405]
 [0.53969698]
 [0.4911353 ]
 [0.38984049]
 [0.84046841]
 [0.50098494]
 [0.3539762 ]
 [0.57933475]
 [0.42608073]
 [0.49432112]
 [0.41588136]
 [0.50363753]
 [0.49464266]
 [0.39507703]
 [0.26448625]
 [0.53011184]
 [0.37992645]
 [0.48581851]
 [0.44260452]
 [0.37835892]
 [0.4876137 ]
 [0.5558591 ]
 [0.42615518]
 [0.52496465]
 [0.54480534]
 [0.53305071]
 [0.55871666]
 [0.43328693]
 [0.51952578]
 [0.42249565]
 [0.55568979]
 [0.56492972]
 [0.44802283]
 [0.5937965 ]
 [0.76035292]
 [0.46304483]
 [0.5056868 ]
 [0.39306417]
 [0.75910231]
 [0.41467363]
 [0.67028621]
 [0.37284126]
 [0.52823741]
 [0.44631063]
 [0.55532268]
 [0.62182567]
 [0.56997007]
 [0.32093247]
 [0.28885814]
 [0.40313601]
 [0.69415569]
 [0.78958184]
 [0.63326574]
 [0.55571037]
 [0.61334342]
 [0.57922728]
 [0.50387058]
 [0.28639762]
 [0.50246076]
 [0.4734542 ]
 [0.60367878]
 [0.62678803]
 [0.4952507 ]
 [0.50

In [3]:
import numpy as np

class SLFNRBF:
    """SLFN with Gaussian RBF hidden nodes and least-squares output weights.

    ``fit`` picks ``hidden_nodes`` distinct training samples as centers, uses
    the mean sample-to-center distance as the shared width ``sigma``, and
    computes the output weights ``beta`` with the Moore-Penrose
    pseudo-inverse of the hidden layer output matrix.
    """

    def __init__(self, input_dim, hidden_nodes):
        """
        Store the network dimensions; nothing is drawn until fit().

        Parameters:
        - input_dim: The dimension of the input features.
        - hidden_nodes: The number of RBF hidden nodes (centers).
        """
        self.input_dim = input_dim
        self.hidden_nodes = hidden_nodes
        # All three stay None until fit() has been called.
        self.beta = None
        self.centers = None
        self.sigma = None

    def _rbf_function(self, X, centers, sigma):
        """
        Gaussian RBF kernel between every sample and every center.

        Parameters:
        - X: The input features (shape: [num_samples, input_dim]).
        - centers: The RBF centers (shape: [num_centers, input_dim]).
        - sigma: The kernel width shared by all centers.

        Returns:
        - Matrix of shape [num_samples, num_centers] with entries
          exp(-||x - c||^2 / (2 * sigma^2)).
        """
        # Broadcast to (num_samples, num_centers, input_dim) and sum squares
        # directly instead of taking a norm (square root) and squaring again.
        diff = X[:, np.newaxis] - centers
        sq_dist = np.sum(diff * diff, axis=2)
        return np.exp(-sq_dist / (2 * sigma ** 2))

    def fit(self, X, y):
        """
        Select centers, set the kernel width and solve for the output weights.

        Parameters:
        - X: The input features (shape: [num_samples, input_dim]).
        - y: The target values, one row per sample.

        Raises:
        - ValueError: if ``hidden_nodes`` exceeds the number of samples
          (centers are sampled without replacement).
        """
        num_samples = X.shape[0]
        if self.hidden_nodes > num_samples:
            raise ValueError(
                "hidden_nodes (%d) cannot exceed the number of samples (%d): "
                "centers are drawn from X without replacement"
                % (self.hidden_nodes, num_samples)
            )

        # Randomly pick distinct training samples as the RBF centers
        self.centers = X[np.random.choice(num_samples, self.hidden_nodes, replace=False)]

        # Shared width: mean Euclidean distance between samples and centers
        sigma = np.mean(np.linalg.norm(X[:, np.newaxis] - self.centers, axis=2))
        if sigma == 0:
            # Every sample coincides with every center, so all squared
            # distances are 0; use a unit width to avoid 0/0 = NaN in the kernel.
            sigma = 1.0
        self.sigma = sigma

        # Compute hidden layer output matrix H
        H = self._rbf_function(X, self.centers, self.sigma)

        # Least-squares output weights via the pseudo-inverse of H
        self.beta = np.linalg.pinv(H) @ y

    def predict(self, X):
        """
        Predict the output for given input data.

        Parameters:
        - X: The input features (shape: [num_samples, input_dim]).

        Returns:
        - Predicted output values.

        Raises:
        - RuntimeError: if the model has not been fitted yet.
        """
        if self.centers is None or self.sigma is None or self.beta is None:
            raise RuntimeError(
                "SLFNRBF is not fitted yet; call fit() before predict()."
            )
        H = self._rbf_function(X, self.centers, self.sigma)
        return H @ self.beta

# Demonstration: fit the RBF-node SLFN to random data and print its fit
if __name__ == "__main__":
    # Seed NumPy's global RNG, then draw 100 samples of 5 features and 1 target
    np.random.seed(0)
    X, y = np.random.rand(100, 5), np.random.rand(100, 1)

    # Ten RBF hidden nodes; fit() selects the centers from X
    model = SLFNRBF(5, 10)
    model.fit(X, y)

    # Predictions on the training inputs
    predictions = model.predict(X)
    print(predictions)


[[0.54733402]
 [0.56880901]
 [0.54205627]
 [0.57327753]
 [0.4468781 ]
 [0.56994786]
 [0.46543569]
 [0.52951433]
 [0.47680798]
 [0.6886485 ]
 [0.52394805]
 [0.45211762]
 [0.44195841]
 [0.40600525]
 [0.29348904]
 [0.37435315]
 [0.53798543]
 [0.51448145]
 [0.48200389]
 [0.37903319]
 [0.56579035]
 [0.42729645]
 [0.4134629 ]
 [0.56745583]
 [0.53069523]
 [0.63515795]
 [0.60664738]
 [0.5832641 ]
 [0.36590251]
 [0.38090445]
 [0.36955207]
 [0.53714864]
 [0.43674267]
 [0.63265826]
 [0.59630768]
 [0.54618567]
 [0.46356991]
 [0.5953703 ]
 [0.52944968]
 [0.62476833]
 [0.44399545]
 [0.53607809]
 [0.74766782]
 [0.50029257]
 [0.44934862]
 [0.28730394]
 [0.48350463]
 [0.58831402]
 [0.57628978]
 [0.52093793]
 [0.56855986]
 [0.58275491]
 [0.48551603]
 [0.62234231]
 [0.3439165 ]
 [0.20985124]
 [0.36293891]
 [0.44487683]
 [0.53068413]
 [0.6024756 ]
 [0.49205961]
 [0.44346988]
 [0.6559669 ]
 [0.6412266 ]
 [0.52984572]
 [0.51889914]
 [0.44464269]
 [0.66143614]
 [0.52972259]
 [0.54077514]
 [0.63735679]
 [0.54