In [None]:
import numpy as np
from scipy.stats import entropy
from scipy.sparse import issparse
from joblib import Parallel, delayed
from scipy.sparse import csr_matrix
mi_matrix_sparse = csr_matrix(mi_matrix)

def mutual_information_matrix_serial(matrix, nbins=20, n_jobs=-1):
    """
    Computes the mutual information matrix in parallel, working directly with sparse matrices,
    and only computes the upper triangular part of the matrix.
    """
    if not issparse(matrix):
        matrix = csr_matrix(matrix)

    n_features = matrix.shape[0]
    mi_matrix = np.zeros((n_features, n_features))

    def compute_pairwise_mi(vi, vj, nbins=20):
        joint_counts, _, _ = np.histogram2d(vi, vj, bins=nbins)
        if joint_counts.sum() == 0:
            return 0  # No mutual information if no overlap
        joint_prob = joint_counts / (joint_counts.sum() + 1e-8)

        marginal_i = joint_prob.sum(axis=1) + 1e-8
        marginal_j = joint_prob.sum(axis=0) + 1e-8

        h_xy = entropy(joint_prob.flatten(), base=2)
        h_x = entropy(marginal_i, base=2)
        h_y = entropy(marginal_j, base=2)

        return float(h_x + h_y - h_xy)

    for i in range(n_features):
        for j in range(i, n_features):
            vi = matrix[i, :].toarray().flatten() if issparse(matrix) else matrix[i, :]
            vj = matrix[j, :].toarray().flatten() if issparse(matrix) else matrix[j, :]
            mi_matrix[i, j] = compute_pairwise_mi(vi, vj, nbins=nbins)
            if i != j:
                mi_matrix[j, i] = mi_matrix[i, j]  # Exploit symmetry
    return mi_matrix


In [2]:
from scipy.sparse import random as sparse_random
from scipy.io import mmwrite

# Generate a sparse random matrix with 1000 rows and 5000 columns
# Density of the matrix is set to 0.01 (1% non-zero elements)
sparse_matrix = sparse_random(10, 50, density=0.01, format='csr')

# Checking the matrix shape
sparse_matrix.shape

mmwrite("sparse_matrix.mtx", sparse_matrix)

In [3]:
#from scipy.sparse import random as sparse_random
#sparse_matrix = sparse_random(100, 1000, density=0.01, format='csr')  # Example sparse matrix
mi_matrix = mutual_information_matrix_serial(sparse_matrix, nbins=20, n_jobs=-1)
print(mi_matrix)

[[1.06468835e-05 1.06468835e-05 1.06468835e-05 1.06468835e-05
  1.01435288e-05 1.03951442e-05 1.06468835e-05 1.06468835e-05
  1.06468835e-05 1.01435288e-05]
 [1.06468835e-05 1.06468835e-05 1.06468835e-05 1.06468835e-05
  1.01435288e-05 1.03951442e-05 1.06468835e-05 1.06468835e-05
  1.06468835e-05 1.01435288e-05]
 [1.06468835e-05 1.06468835e-05 1.06468835e-05 1.06468835e-05
  1.01435288e-05 1.03951442e-05 1.06468835e-05 1.06468835e-05
  1.06468835e-05 1.01435288e-05]
 [1.06468835e-05 1.06468835e-05 1.06468835e-05 1.06468835e-05
  1.01435288e-05 1.03951442e-05 1.06468835e-05 1.06468835e-05
  1.06468835e-05 1.01435288e-05]
 [1.01435288e-05 1.01435288e-05 1.01435288e-05 1.01435288e-05
  2.82301829e-01 1.19995422e-03 1.01435288e-05 1.01435288e-05
  1.01435288e-05 1.42051349e-01]
 [1.03951442e-05 1.03951442e-05 1.03951442e-05 1.03951442e-05
  1.19995422e-03 1.41450686e-01 1.03951442e-05 1.03951442e-05
  1.03951442e-05 1.19995422e-03]
 [1.06468835e-05 1.06468835e-05 1.06468835e-05 1.06468835e

In [4]:
from scipy.sparse import issparse
from scipy.stats import entropy
import numpy as np
from joblib import Parallel, delayed

def mutual_information_matrix_parallel(matrix, nbins=20, n_jobs=-1):
    """
    Computes the mutual information matrix in parallel, working directly with sparse matrices,
    and only computes the upper triangular part of the matrix.
    """
    if not issparse(matrix):
        matrix = matrix.tocsr()

    n_features = matrix.shape[0]
    mi_matrix = np.zeros((n_features, n_features))

    def compute_pairwise_mi(i, j, matrix, nbins=20):
        """
        Computes mutual information between row i and row j of the sparse matrix.
        """
        vi = matrix[i, :].toarray().flatten() if issparse(matrix) else matrix[i, :]
        vj = matrix[j, :].toarray().flatten() if issparse(matrix) else matrix[j, :]
        
        joint_counts, _, _ = np.histogram2d(vi, vj, bins=nbins)
        if joint_counts.sum() == 0:
            return 0  # No mutual information if no overlap
        joint_prob = joint_counts / (joint_counts.sum() + 1e-8)

        marginal_i = joint_prob.sum(axis=1) + 1e-8
        marginal_j = joint_prob.sum(axis=0) + 1e-8

        h_xy = entropy(joint_prob.flatten(), base=2)
        h_x = entropy(marginal_i, base=2)
        h_y = entropy(marginal_j, base=2)

        return float(h_x + h_y - h_xy)

    # Parallelizing the pairwise mutual information computation for the upper triangular matrix
    jobs = [(i, j) for i in range(n_features) for j in range(i + 1, n_features)]
    results = Parallel(n_jobs=n_jobs)(
        delayed(compute_pairwise_mi)(i, j, matrix, nbins) for i, j in jobs
    )

    # Fill the upper triangular matrix with the results
    for idx, (i, j) in enumerate(jobs):
        mi_matrix[i, j] = results[idx]
        mi_matrix[j, i] = results[idx]  # Exploit symmetry to avoid duplicate computation

    return mi_matrix


In [5]:
#from scipy.sparse import random as sparse_random
mi_matrix = mutual_information_matrix_parallel(sparse_matrix, nbins=20, n_jobs=-1)
print(mi_matrix)


[[0.00000000e+00 1.06468835e-05 1.06468835e-05 1.06468835e-05
  1.01435288e-05 1.03951442e-05 1.06468835e-05 1.06468835e-05
  1.06468835e-05 1.01435288e-05]
 [1.06468835e-05 0.00000000e+00 1.06468835e-05 1.06468835e-05
  1.01435288e-05 1.03951442e-05 1.06468835e-05 1.06468835e-05
  1.06468835e-05 1.01435288e-05]
 [1.06468835e-05 1.06468835e-05 0.00000000e+00 1.06468835e-05
  1.01435288e-05 1.03951442e-05 1.06468835e-05 1.06468835e-05
  1.06468835e-05 1.01435288e-05]
 [1.06468835e-05 1.06468835e-05 1.06468835e-05 0.00000000e+00
  1.01435288e-05 1.03951442e-05 1.06468835e-05 1.06468835e-05
  1.06468835e-05 1.01435288e-05]
 [1.01435288e-05 1.01435288e-05 1.01435288e-05 1.01435288e-05
  0.00000000e+00 1.19995422e-03 1.01435288e-05 1.01435288e-05
  1.01435288e-05 1.42051349e-01]
 [1.03951442e-05 1.03951442e-05 1.03951442e-05 1.03951442e-05
  1.19995422e-03 0.00000000e+00 1.03951442e-05 1.03951442e-05
  1.03951442e-05 1.19995422e-03]
 [1.06468835e-05 1.06468835e-05 1.06468835e-05 1.06468835e

In [6]:
from scipy.sparse import csr_matrix
mi_matrix_sparse = csr_matrix(mi_matrix)

mmwrite("mi_matrix.mtx", mi_matrix_sparse)

In [7]:
mi_matrix.shape

(10, 10)

In [11]:
vec1 = [1, 2, 3, 0, 0]  # Row 0
vec2 = [4, 0, 6, 0, 0]  # Row 1
matrix = [ vec1 , vec2]
mi_matrix = mutual_information_matrix_serial(matrix, nbins=20, n_jobs=-1)




AttributeError: 'list' object has no attribute 'tocsr'

In [9]:
mi_matrix

array([[1.06468835e-05, 1.06468835e-05, 1.06468835e-05, 1.06468835e-05,
        1.01435288e-05, 1.03951442e-05, 1.06468835e-05, 1.06468835e-05,
        1.06468835e-05, 1.01435288e-05],
       [1.06468835e-05, 1.06468835e-05, 1.06468835e-05, 1.06468835e-05,
        1.01435288e-05, 1.03951442e-05, 1.06468835e-05, 1.06468835e-05,
        1.06468835e-05, 1.01435288e-05],
       [1.06468835e-05, 1.06468835e-05, 1.06468835e-05, 1.06468835e-05,
        1.01435288e-05, 1.03951442e-05, 1.06468835e-05, 1.06468835e-05,
        1.06468835e-05, 1.01435288e-05],
       [1.06468835e-05, 1.06468835e-05, 1.06468835e-05, 1.06468835e-05,
        1.01435288e-05, 1.03951442e-05, 1.06468835e-05, 1.06468835e-05,
        1.06468835e-05, 1.01435288e-05],
       [1.01435288e-05, 1.01435288e-05, 1.01435288e-05, 1.01435288e-05,
        2.82301829e-01, 1.19995422e-03, 1.01435288e-05, 1.01435288e-05,
        1.01435288e-05, 1.42051349e-01],
       [1.03951442e-05, 1.03951442e-05, 1.03951442e-05, 1.03951442e-05,
   