In [4]:
import numpy as np
from scipy.stats import entropy
from scipy.sparse import issparse
from joblib import Parallel, delayed
from scipy.sparse import csr_matrix

def mutual_information_matrix_serial(matrix, nbins=20, n_jobs=-1):
    """
    Compute the pairwise mutual-information (MI) matrix between the rows of ``matrix``.

    Each row is treated as one variable. For every pair of rows a 2-D histogram
    with ``nbins`` bins per axis estimates the joint distribution, and the MI is
    evaluated in bits as ``H(X) + H(Y) - H(X, Y)``. Only the upper triangle
    (including the diagonal) is computed; the lower triangle is mirrored.

    Parameters
    ----------
    matrix : scipy sparse matrix or array-like, shape (n_rows, n_cols)
        Input data. Anything that is not already sparse is wrapped in a
        ``csr_matrix``; sparse inputs are converted to CSR so rows can be sliced.
    nbins : int, default 20
        Number of histogram bins per axis, forwarded to ``np.histogram2d``.
    n_jobs : int, default -1
        Ignored. This implementation is single-threaded; the parameter is kept
        only so the signature stays compatible with existing callers.

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_rows)
        Symmetric matrix of MI estimates in bits.
    """
    # Row slicing below needs CSR; other sparse formats are converted as well.
    matrix = matrix.tocsr() if issparse(matrix) else csr_matrix(matrix)

    n_features = matrix.shape[0]
    mi_matrix = np.zeros((n_features, n_features))

    def compute_pairwise_mi(vi, vj, nbins=20):
        """Histogram-based MI (in bits) between two equally long 1-D arrays."""
        joint_counts, _, _ = np.histogram2d(vi, vj, bins=nbins)
        total = joint_counts.sum()
        if total == 0:
            return 0.0  # No samples were binned, so there is nothing to estimate
        joint_prob = joint_counts / total

        # scipy's entropy() normalises its input and treats 0 * log(0) as 0, so
        # no smoothing epsilon is needed; adding one would bias the estimate
        # (e.g. a constant row would get a non-zero MI with itself).
        h_xy = entropy(joint_prob.ravel(), base=2)
        h_x = entropy(joint_prob.sum(axis=1), base=2)
        h_y = entropy(joint_prob.sum(axis=0), base=2)

        # MI is non-negative; clamp tiny negative values caused by round-off.
        return max(0.0, float(h_x + h_y - h_xy))

    for i in range(n_features):
        # Densify row i once per outer iteration instead of once per pair.
        vi = matrix[i, :].toarray().ravel()
        for j in range(i, n_features):
            vj = vi if j == i else matrix[j, :].toarray().ravel()
            value = compute_pairwise_mi(vi, vj, nbins=nbins)
            mi_matrix[i, j] = value
            mi_matrix[j, i] = value  # Exploit symmetry
    return mi_matrix


In [5]:
from scipy.sparse import random as sparse_random
from scipy.io import mmwrite

# Generate a sparse random matrix with 500 rows and 5000 columns.
# Density of the matrix is set to 0.01 (1% non-zero elements).
# A fixed seed keeps the matrix identical across "Restart Kernel -> Run All".
sparse_matrix = sparse_random(500, 5000, density=0.01, format='csr', random_state=42)


# Small hand-written alternative input, kept for debugging:
# vec1 = [1, 2, 3, 0, 0]  # Row 0
# vec2 = [4, 0, 6, 0, 0]  # Row 1
# vec3 = [0, 1, 3, 7, 9]  # Row 2
# vec4 = [5, 0, 0, 0 ,2] # Row 3
# matrix = [ vec1 , vec2, vec3, vec3]
# sparse_matrix = csr_matrix(matrix)

# Persist the input in Matrix Market format.
mmwrite("sparse_matrix.mtx", sparse_matrix)

In [None]:
# Compute the row-by-row mutual-information matrix with the serial implementation.
# NOTE(review): n_jobs is passed here, but the serial function defined above never reads it.
mi_matrix = mutual_information_matrix_serial(sparse_matrix, nbins=20, n_jobs=-1)
print(mi_matrix)
# Wrap the dense result in CSR form so it can be written with mmwrite.
sparse_matrix_mi = csr_matrix(mi_matrix)
print(sparse_matrix_mi)
# NOTE(review): the parallel cell further down writes to this same file name,
# so whichever cell runs last determines the file's contents.
mmwrite("sparse_matrix_mi.mtx", sparse_matrix_mi)

In [4]:
from scipy.sparse import issparse
from scipy.stats import entropy
import numpy as np
from joblib import Parallel, delayed

def mutual_information_matrix_parallel(matrix, nbins=20, n_jobs=-1):
    """
    Compute the pairwise mutual-information (MI) matrix between the rows of
    ``matrix``, distributing the work over joblib workers.

    Each row is treated as one variable. For every pair of rows a 2-D histogram
    with ``nbins`` bins per axis estimates the joint distribution, and the MI is
    evaluated in bits as ``H(X) + H(Y) - H(X, Y)``. The upper triangle
    (including the diagonal) is computed and mirrored into the lower triangle.

    Parameters
    ----------
    matrix : scipy sparse matrix or array-like, shape (n_rows, n_cols)
        Input data. Anything that is not already sparse is wrapped in a
        ``csr_matrix``; sparse inputs are converted to CSR so rows can be sliced.
    nbins : int, default 20
        Number of histogram bins per axis, forwarded to ``np.histogram2d``.
    n_jobs : int, default -1
        Forwarded to ``joblib.Parallel``.

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_rows)
        Symmetric matrix of MI estimates in bits.
    """
    # Imported locally so this cell does not depend on an import made in an
    # earlier cell of the notebook.
    from scipy.sparse import csr_matrix

    # Dense inputs (ndarray, nested lists) have no .tocsr(); build a CSR matrix
    # from them instead. Other sparse formats are converted for row slicing.
    matrix = matrix.tocsr() if issparse(matrix) else csr_matrix(matrix)

    n_features = matrix.shape[0]
    mi_matrix = np.zeros((n_features, n_features))

    def compute_pairwise_mi(vi, vj, nbins=20):
        """Histogram-based MI (in bits) between two equally long 1-D arrays."""
        joint_counts, _, _ = np.histogram2d(vi, vj, bins=nbins)
        total = joint_counts.sum()
        if total == 0:
            return 0.0  # No samples were binned, so there is nothing to estimate
        joint_prob = joint_counts / total

        # scipy's entropy() normalises its input and treats 0 * log(0) as 0, so
        # no smoothing epsilon is needed; adding one would bias the estimate.
        h_xy = entropy(joint_prob.ravel(), base=2)
        h_x = entropy(joint_prob.sum(axis=1), base=2)
        h_y = entropy(joint_prob.sum(axis=0), base=2)

        # MI is non-negative; clamp tiny negative values caused by round-off.
        return max(0.0, float(h_x + h_y - h_xy))

    def compute_row(i, matrix, nbins):
        """MI between row i and every row j >= i (one upper-triangle row)."""
        vi = matrix[i, :].toarray().ravel()  # densified once per task
        values = []
        for j in range(i, matrix.shape[0]):
            vj = vi if j == i else matrix[j, :].toarray().ravel()
            values.append(compute_pairwise_mi(vi, vj, nbins))
        return values

    # One task per row rather than one per pair: far fewer tasks to schedule
    # and row i is extracted only once for all of its pairs.
    row_results = Parallel(n_jobs=n_jobs)(
        delayed(compute_row)(i, matrix, nbins) for i in range(n_features)
    )

    # Fill the upper triangle and mirror it to exploit symmetry.
    for i, values in enumerate(row_results):
        mi_matrix[i, i:] = values
        mi_matrix[i:, i] = values

    return mi_matrix


In [5]:
# Compute the mutual-information matrix with the joblib-based implementation.
# NOTE(review): csr_matrix and mmwrite are not imported in this cell or the one
# defining the parallel function; they come from imports in earlier cells.
#from scipy.sparse import random as sparse_random
mi_matrix = mutual_information_matrix_parallel(sparse_matrix, nbins=20, n_jobs=-1)
#print(mi_matrix)
# Wrap the dense result in CSR form so it can be written with mmwrite.
sparse_matrix_mi = csr_matrix(mi_matrix)
print(sparse_matrix_mi)
# NOTE(review): same output path as the serial cell above, so this overwrites that file.
mmwrite("sparse_matrix_mi.mtx", sparse_matrix_mi)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 25000000 stored elements and shape (5000, 5000)>
  Coords	Values
  (0, 0)	0.11042251030392024
  (0, 1)	0.0015904277406188494
  (0, 2)	0.00210130012464943
  (0, 3)	0.0017793448224953934
  (0, 4)	0.00014333043106953824
  (0, 5)	0.0015678491145933449
  (0, 6)	0.0001300453326787132
  (0, 7)	0.00017215087334560009
  (0, 8)	0.00014069710655598144
  (0, 9)	0.00011968785516028313
  (0, 10)	0.0018167337390071758
  (0, 11)	0.0001093334426761905
  (0, 12)	0.00011722067029992789
  (0, 13)	0.00012757217679940092
  (0, 14)	0.00012262997139786402
  (0, 15)	0.00011458313823714539
  (0, 16)	0.0031663955246634956
  (0, 17)	0.0034982280083254436
  (0, 18)	0.00010933934195395123
  (0, 19)	0.0001301791999978874
  (0, 20)	0.00015387999677995978
  (0, 21)	0.0015625813269759536
  (0, 22)	0.001759228970672183
  (0, 23)	0.0001800775246897368
  (0, 24)	0.00011689299745473725
  :	:
  (4999, 4975)	0.00017116470788935967
  (4999, 4976)	0.0001383534431407

In [47]:
# Show the dimensions of the most recently computed MI matrix.
# NOTE(review): the recorded output below does not match the 500-row input built
# earlier in this notebook; presumably it is stale. Re-run from a fresh kernel to confirm.
print(mi_matrix.shape)

(1000, 1000)
