In [73]:
import numpy as np
from scipy.stats import entropy
from scipy.sparse import issparse
from joblib import Parallel, delayed

def mutual_information_matrix_serial(matrix, nbins=20, n_jobs=-1):
    """
    Computes the mutual information matrix in parallel, working directly with sparse matrices,
    and only computes the upper triangular part of the matrix.
    """
    if not issparse(matrix):
        matrix = matrix.tocsr()

    n_features = matrix.shape[0]
    mi_matrix = np.zeros((n_features, n_features))

    def compute_pairwise_mi(vi, vj, nbins=20):
        joint_counts, _, _ = np.histogram2d(vi, vj, bins=nbins)
        if joint_counts.sum() == 0:
            return 0  # No mutual information if no overlap
        joint_prob = joint_counts / (joint_counts.sum() + 1e-8)

        marginal_i = joint_prob.sum(axis=1) + 1e-8
        marginal_j = joint_prob.sum(axis=0) + 1e-8

        h_xy = entropy(joint_prob.flatten(), base=2)
        h_x = entropy(marginal_i, base=2)
        h_y = entropy(marginal_j, base=2)

        return float(h_x + h_y - h_xy)

    for i in range(n_features):
        for j in range(i, n_features):
            vi = matrix[i, :].toarray().flatten() if issparse(matrix) else matrix[i, :]
            vj = matrix[j, :].toarray().flatten() if issparse(matrix) else matrix[j, :]
            mi_matrix[i, j] = compute_pairwise_mi(vi, vj, nbins=nbins)
            if i != j:
                mi_matrix[j, i] = mi_matrix[i, j]  # Exploit symmetry
    return mi_matrix


In [74]:
from scipy.sparse import random as sparse_random

# Generate a sparse random matrix with 1000 rows and 5000 columns
# Density of the matrix is set to 0.01 (1% non-zero elements)
sparse_matrix = sparse_random(1000, 5000, density=0.01, format='csr')

# Checking the matrix shape
sparse_matrix.shape



(1000, 5000)

In [75]:
#from scipy.sparse import random as sparse_random
#sparse_matrix = sparse_random(100, 1000, density=0.01, format='csr')  # Example sparse matrix
mi_matrix = mutual_information_matrix_serial(sparse_matrix, nbins=20, n_jobs=-1)
print(mi_matrix)


[[1.18864891e-01 1.65955874e-03 1.82754875e-03 ... 1.61457335e-03
  2.09605944e-03 1.12979878e-04]
 [1.65955874e-03 1.53868497e-01 1.31313465e-03 ... 1.49256285e-03
  2.08846836e-04 1.50906120e-04]
 [1.82754875e-03 1.31313465e-03 1.10537815e-01 ... 1.38830081e-04
  1.47057364e-04 1.06787749e-04]
 ...
 [1.61457335e-03 1.49256285e-03 1.38830081e-04 ... 1.20657407e-01
  1.52829678e-03 1.43565108e-03]
 [2.09605944e-03 2.08846836e-04 1.47057364e-04 ... 1.52829678e-03
  1.25980561e-01 1.22160248e-04]
 [1.12979878e-04 1.50906120e-04 1.06787749e-04 ... 1.43565108e-03
  1.22160248e-04 9.29633178e-02]]


In [76]:
from scipy.sparse import issparse
from scipy.stats import entropy
import numpy as np
from joblib import Parallel, delayed

def mutual_information_matrix_parallel(matrix, nbins=20, n_jobs=-1):
    """
    Computes the mutual information matrix in parallel, working directly with sparse matrices,
    and only computes the upper triangular part of the matrix.
    """
    if not issparse(matrix):
        matrix = matrix.tocsr()

    n_features = matrix.shape[0]
    mi_matrix = np.zeros((n_features, n_features))

    def compute_pairwise_mi(i, j, matrix, nbins=20):
        """
        Computes mutual information between row i and row j of the sparse matrix.
        """
        vi = matrix[i, :].toarray().flatten() if issparse(matrix) else matrix[i, :]
        vj = matrix[j, :].toarray().flatten() if issparse(matrix) else matrix[j, :]
        
        joint_counts, _, _ = np.histogram2d(vi, vj, bins=nbins)
        if joint_counts.sum() == 0:
            return 0  # No mutual information if no overlap
        joint_prob = joint_counts / (joint_counts.sum() + 1e-8)

        marginal_i = joint_prob.sum(axis=1) + 1e-8
        marginal_j = joint_prob.sum(axis=0) + 1e-8

        h_xy = entropy(joint_prob.flatten(), base=2)
        h_x = entropy(marginal_i, base=2)
        h_y = entropy(marginal_j, base=2)

        return float(h_x + h_y - h_xy)

    # Parallelizing the pairwise mutual information computation for the upper triangular matrix
    jobs = [(i, j) for i in range(n_features) for j in range(i + 1, n_features)]
    results = Parallel(n_jobs=n_jobs)(
        delayed(compute_pairwise_mi)(i, j, matrix, nbins) for i, j in jobs
    )

    # Fill the upper triangular matrix with the results
    for idx, (i, j) in enumerate(jobs):
        mi_matrix[i, j] = results[idx]
        mi_matrix[j, i] = results[idx]  # Exploit symmetry to avoid duplicate computation

    return mi_matrix


In [77]:
#from scipy.sparse import random as sparse_random
mi_matrix = mutual_information_matrix_parallel(sparse_matrix, nbins=20, n_jobs=-1)
print(mi_matrix)


[[0.         0.00165956 0.00182755 ... 0.00161457 0.00209606 0.00011298]
 [0.00165956 0.         0.00131313 ... 0.00149256 0.00020885 0.00015091]
 [0.00182755 0.00131313 0.         ... 0.00013883 0.00014706 0.00010679]
 ...
 [0.00161457 0.00149256 0.00013883 ... 0.         0.0015283  0.00143565]
 [0.00209606 0.00020885 0.00014706 ... 0.0015283  0.         0.00012216]
 [0.00011298 0.00015091 0.00010679 ... 0.00143565 0.00012216 0.        ]]


In [102]:
import mutual_information_serial as mis
# Test the function
mi_matrix = mis.mutual_information_matrix_cython(sparse_matrix, nbins=20)
print(mi_matrix)



ValueError: Buffer dtype mismatch, expected 'float' but got 'double'