In [34]:
import numpy as np
from scipy.stats import norm

# Seed NumPy's global generator so each run draws the same sample
np.random.seed(37)

# Draw N observations of four variables: (x1, x2) and (x3, x4) are two
# linear-Gaussian pairs, with the second member centred on a linear
# function of the first.
N = 1000
x1 = np.random.normal(loc=1, scale=1, size=N)
x2 = np.random.normal(loc=1 + 3.5 * x1, scale=1, size=N)  # mean depends on x1
x3 = np.random.normal(loc=2, scale=1, size=N)
x4 = np.random.normal(loc=3.8 - 2.5 * x3, scale=1, size=N)  # mean depends on x3

# Stack the variables as rows, then transpose: one row per sample,
# one column per variable
data = np.stack((x1, x2, x3, x4)).T

# Summary statistics, treating each column of `data` as a variable
means = np.mean(data, axis=0)
cov = np.cov(data, rowvar=False)  # Covariance matrix
std = np.sqrt(cov.diagonal())  # Standard deviations (root of the variances)
cor = np.corrcoef(data, rowvar=False)  # Correlation matrix

# Report the summary statistics
print('Means:', means)
print('Covariance Matrix:\n', cov)
print('Standard Deviations:', std)
print('Correlation Matrix:\n', cor)

# Split a mean vector into the entries for subset 1 and subset 2
def partition_means(index_1, means, index_2=None):
    """
    Selects the entries of ``means`` belonging to each of two index subsets.

    :param index_1: Indices of subset 1
    :param means: Mean vector; indexed with index lists, so it must support
        that kind of indexing (e.g. a NumPy array)
    :param index_2: Optional indices of subset 2; when omitted, every
        position of ``means`` not listed in ``index_1`` is used
    :return: Tuple ``(means[index_1], means[index_2])``
    """
    if index_2 is None:
        # Default subset 2 to the complement of subset 1, in ascending order
        index_2 = [pos for pos in range(len(means)) if pos not in index_1]
    return means[index_1], means[index_2]

# Split a covariance matrix into its four subset blocks
def partition_cov(index_1, cov, index_2=None):
    """
    Cuts ``cov`` into the blocks induced by two index subsets.

    :param index_1: Indices of subset 1
    :param cov: Square covariance matrix (2-D NumPy array)
    :param index_2: Optional indices of subset 2; when omitted, every column
        of ``cov`` not listed in ``index_1`` is used
    :return: Tuple ``(s_11, s_12, s_21, s_22_inv)``. NOTE: the last element
        is the *inverse* of the subset-2 block, not the block itself.
    """
    if index_2 is None:
        # Default subset 2 to the complement of subset 1, in ascending order
        index_2 = [col for col in range(cov.shape[1]) if col not in index_1]

    # Pick the rows of each subset once, then slice columns out of them
    rows_1 = cov[index_1]
    rows_2 = cov[index_2]

    s_11 = rows_1[:, index_1]  # subset 1 with itself
    s_12 = rows_1[:, index_2]  # subset 1 against subset 2
    s_21 = rows_2[:, index_1]  # subset 2 against subset 1
    s_22 = rows_2[:, index_2]  # subset 2 with itself

    s_22_inv = np.linalg.inv(s_22)
    return s_11, s_12, s_21, s_22_inv

# Split one observation vector into its subset-1 and subset-2 entries
def partition_x(index_1, x, index_2=None):
    """
    Selects the entries of ``x`` belonging to each of two index subsets.

    :param index_1: Indices of subset 1
    :param x: Data vector; indexed with index lists, so it must support that
        kind of indexing (e.g. a NumPy array)
    :param index_2: Optional indices of subset 2; when omitted, every
        position of ``x`` not listed in ``index_1`` is used
    :return: Tuple ``(x[index_1], x[index_2])``
    """
    if index_2 is None:
        # Default subset 2 to the complement of subset 1, in ascending order
        index_2 = [pos for pos in range(len(x)) if pos not in index_1]
    return x[index_1], x[index_2]

# Function to compute the log probability for the conditional distribution
def get_log_proba(index_1, data, means, cov, index_2=None, zero=1e-6):
    """
    Calculates the log-probability of x_a given x_b based on the conditional
    multivariate normal distribution: P(x_a | x_b).

    With the joint mean and covariance partitioned into (a, b) blocks, a
    single variable x_a conditioned on x_b is univariate normal with

        mean     = m_a + S_ab S_bb^-1 (x_b - m_b)
        variance = S_aa - S_ab S_bb^-1 S_ba

    :param index_1: Indices corresponding to x_a (subset 1); must contain
        exactly one index, because the conditional is evaluated as a
        univariate normal
    :param data: Dataset (N samples x M variables)
    :param means: Mean vector of the full dataset (NumPy array)
    :param cov: Covariance matrix of the full dataset (NumPy array)
    :param index_2: Optional indices corresponding to x_b (subset 2);
        defaults to every variable not in index_1
    :param zero: Smallest density treated as non-zero. A sample whose
        conditional density falls below it contributes log(zero), which keeps
        the sum finite while still penalising the sample. A non-positive
        value disables the floor.
    :return: Tuple of (sum of log-probabilities over all data points, as an
        array of shape (1,); regression coefficients of x_a on x_b)
    :raises ValueError: If index_1 does not contain exactly one index
    """
    if len(index_1) != 1:
        raise ValueError(
            'index_1 must contain exactly one index; the conditional '
            'distribution is evaluated as a univariate normal'
        )

    data = np.asarray(data)

    # Partition means and covariance matrix (partition_cov already returns
    # the inverse of the subset-2 block as its last element)
    m_1, m_2 = partition_means(index_1, means, index_2)
    s_11, s_12, s_21, s_22_inv = partition_cov(index_1, cov, index_2)

    # Regression coefficients of x_a on x_b; identical for every sample, so
    # they are computed once instead of once per row
    beta = s_12.dot(s_22_inv)

    # Conditional variance of x_a given x_b (scalar)
    conditional_variance = (s_11 - beta.dot(s_21))[0, 0]

    # Transpose so that the first axis runs over variables; partition_x then
    # splits all samples at once: x_1 is (1, N), x_2 is (len(index_2), N)
    x_1, x_2 = partition_x(index_1, data.T, index_2)

    # Conditional mean of x_a given x_b for every sample, shape (1, N)
    conditional_mean = m_1[:, None] + beta.dot(x_2 - m_2[:, None])

    # Work in log space directly so tiny densities do not underflow to 0
    log_p = norm.logpdf(x_1, loc=conditional_mean, scale=np.sqrt(conditional_variance))

    if zero > 0:
        # Floor very unlikely samples at log(zero). Replacing them with 0.0
        # would score them as probability 1 and inflate the total.
        log_p = np.maximum(log_p, np.log(zero))

    return log_p.sum(axis=1), beta[0]

# Example usage: score x2 conditioned on all remaining columns.
# index_2 is left at its default, so the conditioning set is (x1, x3, x4),
# not x1 alone.
index_1 = [1]  # Column 1 of `data` holds x2
log_probability, regression_coefficients = get_log_proba(
    index_1=index_1,
    data=data,
    means=means,
    cov=cov,
)

print('Log-probability:', log_probability)
print('Regression Coefficients:', regression_coefficients)


Means: [ 1.01277839  4.52863965  1.98990172 -1.17391554]
Covariance Matrix:
 [[ 9.63461496e-01  3.36840170e+00 -1.12846545e-02 -5.12464592e-02]
 [ 3.36840170e+00  1.27550651e+01 -9.26050108e-02 -8.56265759e-02]
 [-1.12846545e-02 -9.26050108e-02  9.70507183e-01 -2.46328945e+00]
 [-5.12464592e-02 -8.56265759e-02 -2.46328945e+00  7.25484316e+00]]
Standard Deviations: [0.98156075 3.5714234  0.98514323 2.69348161]
Correlation Matrix:
 [[ 1.          0.9608716  -0.01167002 -0.01938352]
 [ 0.9608716   1.         -0.02632048 -0.0089013 ]
 [-0.01167002 -0.02632048  1.         -0.9283293 ]
 [-0.01938352 -0.0089013  -0.9283293   1.        ]]
Log-probability: [-1405.27079403]
Regression Coefficients: [ 3.49205536 -0.16036643 -0.04158602]
