In [None]:
'''
 * Copyright (c) 2018 Radhamadhab Dalai
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
'''

## Bayesian Classification Theory

Bayes' rule is a fundamental theorem in probability used for classification:

$$
p(c_k | x) = \frac{p(x | c_k) \cdot p(c_k)}{p(x)}
$$

where:

$$
p(x) = \sum_{i=1}^{M} p(x | c_i) \cdot p(c_i)
$$

Here, \( M \) is the number of classes or groups, \( x = [x_1, \ldots, x_d]^T \) is a vector of feature values, and \( p(c_k | x) \) is the conditional probability that a given vector \( x \) belongs to the class \( c_k \). It is assumed that all possible events fall into exactly one of \( M \) classes or groups \( \{c_1, \ldots, c_M\} \).

### Naive Bayesian Classification

In Naive Bayesian classification, the goal is to assign the feature vector \( x \) to the class \( c_k \) with the highest conditional probability \( p(c_k | x) \). To estimate \( p(c_k | x) \), we need to estimate \( p(x | c_k) \), \( p(c_k) \), and \( p(x) \). Estimating \( p(x | c_k) \) is challenging due to the vector \( x \) containing \( d \) components \( x_1, \ldots, x_d \). 

A common strategy is to assume that the distribution of \( x \) conditional on \( c_k \) can be decomposed as:

$$
p(x | c_k) = \prod_{i=1}^{d} p(x_i | c_k)
$$

This assumption implies that the occurrence of a particular value of \( x_i \) is statistically independent of the occurrence of any other \( x_j \), \( j \neq i \), given the class \( c_k \). 

Under this assumption, Bayes' rule simplifies to:

$$
p(c_k | x) = \frac{p(c_k) \cdot \prod_{i=1}^{d} p(x_i | c_k)}{p(x)}
$$

where:

$$
p(x) = \sum_{l=1}^{M} p(c_l) \cdot \prod_{i=1}^{d} p(x_i | c_l)
$$

Thus, the estimation formula for the posterior probability is:

$$
\hat{p}(c_k | x) = \frac{\hat{p}(c_k) \cdot \prod_{i=1}^{d} \hat{p}(x_i | c_k)}{\hat{p}(x)}
$$

This estimate can then be used for classification.

### Bayesian Classification Theory

Bayesian decision theory provides a fundamental probability model for classification procedures. Consider an \( M \)-group classification problem where each object has an associated attribute vector of dimension \( d \). Let \( x \in \mathbb{R}^d \) be an attribute vector, and \( w_j \) denote the membership variable that takes the value 1 if an object belongs to group \( j \). Define \( p(\omega_j) \) as the prior probability of group \( j \) and \( f(x | \omega_j) \) as the probability density function.

According to Bayes' rule:

$$
p(\omega_j | x) = \frac{f(x | \omega_j) \cdot p(\omega_j)}{f(x)}
$$

where \( p(\omega_j | x) \) is the posterior probability of group \( j \) and \( f(x) \) is the probability density function:

$$
f(x) = \sum_{j=1}^{M} f(x | \omega_j) \cdot p(\omega_j)
$$

If an object with feature vector \( x \) is observed and classified into group \( j \), the probability of classification error is given by:

$$
p(\text{Error} | x) = 1 - p(\omega_j | x)
$$


In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
data = load_iris()
X = data.data
y = data.target

# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train the Naive Bayes classifier
model = GaussianNB()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.98


In [2]:
# Custom Bayesian Classification Implementation
import numpy as np

def calculate_probabilities(X, y):
    classes = np.unique(y)
    probs = {}
    for cls in classes:
        X_cls = X[y == cls]
        probs[cls] = {
            'prior': len(X_cls) / len(X),
            'mean': np.mean(X_cls, axis=0),
            'var': np.var(X_cls, axis=0)
        }
    return probs

def gaussian_pdf(x, mean, var):
    coef = 1 / np.sqrt(2 * np.pi * var)
    exponent = np.exp(- (x - mean) ** 2 / (2 * var))
    return coef * exponent

def predict(X, probs):
    predictions = []
    for x in X:
        class_probs = {}
        for cls, values in probs.items():
            prior = values['prior']
            mean = values['mean']
            var = values['var']
            likelihood = np.prod(gaussian_pdf(x, mean, var))
            class_probs[cls] = prior * likelihood
        predictions.append(max(class_probs, key=class_probs.get))
    return np.array(predictions)

# Example usage:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
data = load_iris()
X = data.data
y = data.target

# Convert to binary classification for simplicity
y = (y == 0).astype(int)

# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Calculate probabilities from training data
probs = calculate_probabilities(X_train, y_train)

# Predict on the test set
y_pred = predict(X_test, probs)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')



Accuracy: 1.00


The purpose of Bayesian classification rule is to minimize the probability of total
classification error (misclassification rate):

$$
\text{Decide } \omega_k \text{ if } p(\omega_k | x) = \max\{p(\omega_1|x), \ldots, p(\omega_M|x)\}.
\tag{7.7.9}
$$

Let \( c_{ij} \) be the cost of misclassifying to group \( i \) when it actually belongs to group \( j \). The expected cost associated with assigning to group \( i \) is

$$
C_i(x) = \sum_{j=1}^{M} c_{ij} p(\omega_j | x), \quad i = 1, \ldots, M.
\tag{7.7.10}
$$

\( C_i \) is also known as the conditional risk function. The optimal Bayesian decision
rule that minimizes the overall expected cost can be represented as

$$
\text{Decide } \omega_k \text{ for } x \text{ if } C_k(x) = \min\{C_1(x), \ldots, C_M(x)\}.
\tag{7.7.11}
$$

For binary classification with the two classes of \( \omega_1 \) and \( \omega_2 \), we should assign to class +1 if

$$
c_{12}(x) p(\omega_2) f(x|\omega_2) < c_{21}(x) p(\omega_1) f(x|\omega_1)
\tag{7.7.12}
$$

or

$$
\frac{f(x|\omega_1)}{f(x|\omega_2)} > \frac{c_{21}(x) p(\omega_2)}{c_{12}(x) p(\omega_1)}
\tag{7.7.13}
$$

Otherwise, we should assign to class −1.

At this point, we discuss the relationship between Bayes classification and neural
network classification. To this end, let \( C \) be a random variable which specifies the
class membership, i.e., \( C = c_k \) denotes membership in the \( k \)th class. The goal of
classification is then to determine \( C \) given \( x \in X \). To minimize the probability of
classification error, the optimal decision rule corresponds to choosing the class \( c_k \)
which maximizes the posterior probability \( p(c_k | x) \).

Define \( y = [y_1, \ldots, y_m]^T \in \mathbb{R}^M \), where \( M \) is the number of classes. Let \( e_k = [0, \ldots, 0, 1, 0, \ldots, 0]^T \in \mathbb{R}^M \) be the \( M \times 1 \) basic vector whose \( k \)th element equals 1 and others equal zero. Hence, if \( y = e_k \) then \( x \) belongs to class \( k \). This implies that \( y \) and \( c_k \) relate the same information if and only if \( y = e_k \). The advantage of this representation in [151] is: the \( k \)th component of the least squares estimate \( \hat{y} = E\{y|x\} \) can be written as

$$
\hat{y}_k = E\{y_k | x\} = \sum y_k p(y_k | x) = p(y_k = 1 | x) = p(c_k | x).
$$


In [3]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Example dataset
# X is the feature matrix with shape (n_samples, n_features)
# y is the label vector with shape (n_samples,)
# Here we generate a simple dataset for demonstration purposes
np.random.seed(42)
X = np.random.rand(100, 4)  # 100 samples, 4 features
y = np.random.randint(0, 2, 100)  # 100 samples, binary classification (0 or 1)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Train the classifier on the training data
gnb.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = gnb.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Predict the class of a new sample
new_sample = np.array([[0.5, 0.5, 0.5, 0.5]])
predicted_class = gnb.predict(new_sample)
print(f'Predicted class for the new sample: {predicted_class[0]}')


Accuracy: 53.33%
Predicted class for the new sample: 0


# Sparse Bayesian Learning
### Sparse Bayesian Learning

Suppose we are given a sample of \( N \) “training” pairs \(\{x_n , t_n \}_{n=1}^N \), where \( t = [t_1 , \ldots , t_N ]^T \) is a target vector expressed as the sum of an approximation vector \( y = [y(x_1) , \ldots , y(x_N )]^T \) and an “error” vector \(\epsilon = [\epsilon_1 , \ldots , \epsilon_N ]^T \):

$$ t = y + \epsilon = \Phi w + \epsilon $$

where \( w \) is the parameter vector and \( \Phi = [\phi_1 , \ldots , \phi_M ] \) is the \( N \times M \) “design” matrix whose columns comprise the complete set of \( M \) “basis vectors.”

In the sparse Bayesian framework, the errors are assumed to be modeled probabilistically as independent zero-mean Gaussians, with variance \(\sigma^2\), i.e.,

$$ p(\epsilon) = \prod_{n=1}^N \mathcal{N}(\epsilon_n | 0, \sigma^2) $$

The above error model implies a multivariate Gaussian likelihood for the target vector \( t \):

$$ p(t|w, \sigma^2) = (2\pi)^{-\frac{N}{2}} \sigma^{-N} \exp \left( -\frac{1}{2\sigma^2} \|t - y\|^2 \right) $$

By [146], given hyperparameters \(\alpha = [\alpha_1, \ldots, \alpha_M]^T\), the posterior parameter distribution conditioned on the data is given by combining the likelihood and prior within Bayes’ rule:

$$ p(w|t, \alpha, \sigma^2) = \frac{p(t|w, \sigma^2) p(w|\alpha)}{p(t|\alpha, \sigma^2)} $$

and is Gaussian \(\mathcal{N}(\mu, \Sigma)\) with

$$ \Sigma = (A + \sigma^{-2} \Phi^T \Phi)^{-1} $$

$$ \mu = \sigma^{-2} \Sigma \Phi^T t $$

where \( A = \text{Diag}(\alpha_1 , \ldots , \alpha_M) \) is an \( M \times M \) diagonal matrix.

To find the solution vector \(\alpha = \alpha_{MP}\), Tipping et al. [146] proposed to use sparse Bayesian learning for formulating the (local) maximization with respect to \(\alpha\) of its logarithm

$$ L(\alpha) = \log p(t|\alpha, \sigma^2) = \log \int p(t|w, \sigma^2) p(w|\alpha) \, dw $$

$$ = -\frac{N}{2} \log(2\pi) + \log |C| + t^T C^{-1} t $$

with

$$ C = \sigma^2 I + \Phi A^{-1} \Phi^T $$

Considering the dependence of \( L(\alpha) \) on a single hyperparameter \(\alpha_i\), \( i \in \{1, \ldots, M\}\), then \( C \) in the above equation can be decomposed as

$$ C = \sigma^2 I + \sum_{m \neq i} \alpha_m^{-1} \phi_m \phi_m^T + \alpha_i^{-1} \phi_i \phi_i^T $$

$$ = C_{-i} + \alpha_i^{-1} \phi_i \phi_i^T $$

where \( C_{-i} \) is \( C \) with the contribution of basis vector \(i\) removed.

Then, the matrix determinant \(|C|\) and the inverse \(C^{-1}\) in the loss \(L(\alpha)\) can be obtained as [146]:

$$ |C| = |C_{-i}| \cdot \left( 1 + \alpha_i^{-1} \phi_i^T C_{-i}^{-1} \phi_i \right) $$

and

$$ C^{-1} = C_{-i}^{-1} - \frac{C_{-i}^{-1} \phi_i \phi_i^T C_{-i}^{-1}}{\alpha_i^{-1} + \phi_i^T C_{-i}^{-1} \phi_i} $$

Hence, we have

$$ L(\alpha) = -\frac{N}{2} \log(2\pi) + \log |C_{-i}| + t^T C_{-i}^{-1} t - \log \alpha_i + \log(\alpha_i + s_i) - \frac{q_i^2}{\alpha_i + s_i} $$

where

$$ s_i = \phi_i^T C_{-i}^{-1} \phi_i $$

$$ q_i = \phi_i^T C_{-i}^{-1} t $$

- Sparsity factor \( s_i \) can be seen to be a measure of the extent that basis vector \(\phi_i\) “overlaps” those already present in the model.
- Quality factor \( q_i \) can be written as \( q_i = \sigma^{-2} \phi_i^T (t - y_{-i}) \), and is thus a measure of the alignment of \(\phi_i\) with the error of the model with that vector excluded.

Analysis of sparse Bayesian learning [30] shows that \( L(\alpha) \) has a unique maximum with respect to \(\alpha_i\):

$$ \alpha_i = \begin{cases} 
\frac{s_i^2}{q_i^2 - s_i}, & \text{if } q_i^2 > s_i \\
\infty, & \text{if } q_i^2 \leq s_i 
\end{cases} $$

This result implies the following:
- If \(\phi_i\) is “in the model” (i.e., \(\alpha_i < \infty\)) yet \(q_i^2 \leq s_i\), then \(\phi_i\) may be deleted (i.e., set \(\alpha_i = \infty\)).
- If \(\phi_i\) is excluded from the model (i.e., \(\alpha_i = \infty\)) and \(q_i^2 > s_i\), \(\phi_i\) may be added (i.e., set \(\alpha_i = \frac{s_i}{q_i^2 - s_i}\)).

Through the above analysis, Tipping et al. [146] proposed the following sequential sparse Bayesian learning algorithm:

1. If regression initialize \(\sigma^2\) to some sensible value (e.g., \(\sigma^2 = \text{var}(t) \times 0.1\)).
2. Initialize with a single basis vector \(\phi_i\), setting

$$ \alpha_i = \frac{\| \phi_i \|^2}{(\phi_i^T t)^2 / (\| \phi_i \|^2 \sigma^2) - 1} $$

All other \(\alpha_m\) are notionally set to infinity.

3. Explicitly compute \(\Sigma\) and \(\mu\) (which are scalars initially), along with initial values of \(s_m\) and \(q_m\) for all \(M\) bases \(\phi_m\).

4. Recompute/update \(\Sigma\), \(\mu\):

$$ \Sigma = (\Phi^T B \Phi + A)^{-1} $$

$$ \mu = \Sigma \Phi^T B t $$

where \( A = \text{Diag}(\alpha_1 , \ldots , \alpha_M) \) and \( B = \sigma^{-2}I \).

5. Select a candidate basis vector \(\phi_i\) from the set of all \(M\).

6. Compute \(\theta_i = q_i^2 - s_i\).

7. If \(\theta_i > 0\) and \(\alpha_i < \infty\) (i.e., \(\phi_i\) is in the model), re-estimate \(\alpha_i\).

8. If \(\theta_i > 0\) and \(\alpha_i = \infty\), add \(\phi_i\) to the model with updated \(\alpha_i\).

9. If \(\theta_i \leq 0\) and \(\alpha_i < \infty\), then delete \(\phi_i\) from the model and set \(\alpha_i = \infty\).

10. In regression and estimating the noise level, update \(\sigma = \frac{\| t - y \|^2}{N - M + \sum_m \alpha_m \Sigma_{mm}}\), where \( y \approx \Phi \mu_{MP} \) and \(\Sigma_{mm}\) is the \((m, m)\)-th diagonal element.

11. Compute

$$ S_m = \phi_m^T B \phi_m - \phi_m^T B \Phi \Sigma \Phi
$$


In [4]:
import numpy as np

class SparseBayesianLearning:
    def __init__(self, sigma2=1.0):
        self.sigma2 = sigma2
        self.alpha = None
        self.mu = None
        self.Sigma = None

    def fit(self, Phi, t, max_iter=500, tol=1e-6):
        N, M = Phi.shape
        self.alpha = np.full(M, np.inf)
        self.alpha[0] = 1.0
        self.mu = np.zeros(M)
        
        for _ in range(max_iter):
            # Compute Sigma and mu
            A_inv = np.diag(1 / self.alpha)
            S_inv = np.linalg.inv(Phi.T @ Phi / self.sigma2 + A_inv)
            self.Sigma = self.sigma2 * S_inv
            self.mu = self.Sigma @ (Phi.T @ t / self.sigma2)
            
            # Update alpha
            gamma = 1 - self.alpha * np.diag(self.Sigma)
            self.alpha = gamma / self.mu ** 2

            # Prune small alpha values
            keep = self.alpha < 1e9
            if np.sum(keep) == 0:
                break
            self.alpha = self.alpha[keep]
            Phi = Phi[:, keep]
            self.mu = self.mu[keep]

            # Convergence check
            if np.max(np.abs(gamma - self.alpha * self.mu ** 2)) < tol:
                break

        return self

    def predict(self, Phi):
        return Phi @ self.mu

# Example usage
if __name__ == "__main__":
    # Generate some synthetic data
    N, M = 100, 10
    np.random.seed(0)
    X = np.random.randn(N, M)
    w_true = np.random.randn(M)
    y = X @ w_true + np.random.randn(N) * 0.1

    # Fit SBL model
    sbl = SparseBayesianLearning(sigma2=0.1)
    sbl.fit(X, y)
    y_pred = sbl.predict(X)

    print("True weights:", w_true)
    print("Estimated weights:", sbl.mu)
    print("Predicted values:", y_pred)




ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 1 is different from 10)

In [5]:
import numpy as np

class SparseBayesianLearning:
    def __init__(self, sigma2=1.0):
        self.sigma2 = sigma2
        self.alpha = None
        self.mu = None
        self.Sigma = None

    def fit(self, Phi, t, max_iter=500, tol=1e-6):
        N, M = Phi.shape
        self.alpha = np.full(M, np.inf)
        self.alpha[0] = 1.0
        self.mu = np.zeros(M)

        # Initializing the active set to contain the first basis vector
        active_set = [0]

        for _ in range(max_iter):
            # Extract active basis functions
            Phi_active = Phi[:, active_set]
            A_inv = np.diag(1 / self.alpha[active_set])
            S_inv = np.linalg.inv(Phi_active.T @ Phi_active / self.sigma2 + A_inv)
            self.Sigma = self.sigma2 * S_inv
            self.mu = self.Sigma @ (Phi_active.T @ t / self.sigma2)
            
            # Update alpha
            gamma = 1 - np.diag(self.Sigma) * self.alpha[active_set]
            self.alpha[active_set] = gamma / self.mu ** 2

            # Prune small alpha values
            keep = self.alpha[active_set] < 1e9
            if np.sum(keep) == 0:
                break
            active_set = [i for i, k in zip(active_set, keep) if k]
            self.alpha = np.full(M, np.inf)
            self.alpha[active_set] = gamma / self.mu ** 2

            # Convergence check
            if np.max(np.abs(gamma - self.alpha[active_set] * self.mu ** 2)) < tol:
                break

        return self

    def predict(self, Phi):
        # Extract active basis functions for prediction
        active_set = np.where(self.alpha < 1e9)[0]
        Phi_active = Phi[:, active_set]
        return Phi_active @ self.mu

# Example usage
if __name__ == "__main__":
    # Generate some synthetic data
    N, M = 100, 10
    np.random.seed(0)
    X = np.random.randn(N, M)
    w_true = np.random.randn(M)
    y = X @ w_true + np.random.randn(N) * 0.1

    # Fit SBL model
    sbl = SparseBayesianLearning(sigma2=0.1)
    sbl.fit(X, y)
    y_pred = sbl.predict(X)

    print("True weights:", w_true)
    print("Estimated weights:", sbl.mu)
    print("Predicted values:", y_pred)


True weights: [ 0.55596268  0.89247389 -0.42231482  0.10471403  0.22805333  0.20147995
  0.54077359 -1.81807763 -0.04932407  0.2390336 ]
Estimated weights: [0.04594617]
Predicted values: [ 0.08105144  0.00661825 -0.1173001   0.00711924 -0.04817699 -0.04114326
 -0.03089698  0.03349892 -0.05353417 -0.01852444  0.08652356  0.08580717
  0.01729531 -0.03537469 -0.06851757 -0.00313544 -0.02288268  0.04230993
 -0.01626466 -0.05940182 -0.01696249  0.04181923 -0.04392201 -0.06543014
 -0.02928779  0.02394093 -0.05462353  0.01833465 -0.00507892 -0.02885822
 -0.0600299  -0.00614307 -0.03306306  0.03433044  0.04909391 -0.03510948
  0.03209143  0.01288522 -0.07802145  0.02606481 -0.02750585 -0.06646272
  0.0427072  -0.02144976 -0.01815926 -0.04845613 -0.02377802 -0.09264615
  0.03544315 -0.04732158  0.01758509  0.04113378 -0.01022788 -0.10290122
  0.00980859  0.00753189  0.09240953  0.02084954 -0.08967792  0.0249631
 -0.07123628 -0.08005459  0.03108078 -0.03157876 -0.00415271  0.01318216
 -0.0428558

In [6]:
import numpy as np

def sbl(X, t, max_iter=100, tol=1e-6):
    """
    Sparse Bayesian Learning (SBL) for regression.
    
    Parameters:
    X : numpy array of shape (N, M)
        The design matrix where N is the number of samples and M is the number of features (basis functions).
    t : numpy array of shape (N,)
        The target vector.
    max_iter : int
        The maximum number of iterations.
    tol : float
        The convergence tolerance.
    
    Returns:
    mu : numpy array of shape (M,)
        The estimated weights.
    alpha : numpy array of shape (M,)
        The hyperparameters.
    sigma2 : float
        The noise variance.
    """
    
    N, M = X.shape
    alpha = np.ones(M) * 1e-6
    sigma2 = np.var(t) * 0.1

    for iteration in range(max_iter):
        # Compute A, Sigma, and mu
        A = np.diag(alpha)
        Sigma_inv = A + (1 / sigma2) * np.dot(X.T, X)
        Sigma = np.linalg.inv(Sigma_inv)
        mu = (1 / sigma2) * np.dot(Sigma, np.dot(X.T, t))

        # Compute gamma
        gamma = 1 - alpha * np.diag(Sigma)

        # Update alpha and sigma2
        alpha_new = gamma / (mu ** 2)
        sigma2_new = np.sum((t - np.dot(X, mu)) ** 2) / (N - np.sum(gamma))

        # Check for convergence
        if np.max(np.abs(alpha_new - alpha)) < tol and np.abs(sigma2_new - sigma2) < tol:
            break

        alpha = alpha_new
        sigma2 = sigma2_new

    return mu, alpha, sigma2

# Example usage
np.random.seed(42)
N = 100
M = 10
X = np.random.randn(N, M)
w_true = np.random.randn(M)
t = np.dot(X, w_true) + np.random.randn(N) * 0.1

mu, alpha, sigma2 = sbl(X, t)

print(f'Estimated weights:\n{mu}')
print(f'Hyperparameters:\n{alpha}')
print(f'Noise variance:\n{sigma2}')


Estimated weights:
[ 1.3990157   0.92274868  0.05144812 -0.64339411  0.70301962  0.38327377
  0.89031089  0.65747169  1.07162577 -0.53273178]
Hyperparameters:
[  0.51088519   1.17427923 363.42015907   2.41502917   2.02289376
   6.80180289   1.26137757   2.31264318   0.8706797    3.52206255]
Noise variance:
0.010677310972015039
