In [1]:
import sys
import os
import pandas as pd
import numpy as np
import scipy
import gget
import sklearn.preprocessing 
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster
from importlib import reload
import matplotlib.pyplot as plt
from matplotlib import patches
import seaborn as sns
import networkx as nx
from matplotlib_venn import venn2
from matplotlib_venn import venn3
from scipy import sparse
import xgi

import warnings
warnings.simplefilter("ignore", category=RuntimeWarning)

source_path = os.path.abspath("../source/")
sys.path.append(source_path)
import utils as ut
import plotting as plt2
import hypercore as hc
import matrix as matrix
import centrality as central

ModuleNotFoundError: No module named 'utils'

# load 1D features 

In [None]:
# Directory holding one parquet table of 1D features per assay. Only files
# whose name contains `resolution` are loaded.
dpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/1D_features/"
resolution = "1000000"

features = []
# sorted(): os.listdir returns entries in arbitrary order, which would make the
# column order of `features` (and every downstream heatmap) vary between runs.
for f in sorted(os.listdir(dpath)):
    if resolution not in f:
        continue
    # the text before the first underscore of the file name prefixes every column
    dtype = f.split("_")[0]
    fpath = f"{dpath}{f}"
    fdf = pd.read_parquet(fpath)
    fdf = fdf.set_index('index')
    fdf.columns = [f"{dtype}_{x}" for x in fdf.columns]
    features.append(fdf)

if not features:
    # pd.concat([]) fails with an unhelpful message; say what is actually wrong
    raise FileNotFoundError(f"no feature files matching '{resolution}' in {dpath}")

# side-by-side join on the shared index; entries missing from a table become 0
features = pd.concat(features, axis=1)
features = features.fillna(0)

## feature scaling
# every column is rescaled independently to [0, 1]
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(0, 1))
features_scaled = scaler.fit_transform(features)
features_scaled = pd.DataFrame(features_scaled, 
                               index=features.index, 
                               columns=features.columns)

feature_columns = features.columns # useful later
# expose the index as a regular 'bin' column so it can be merged on later
features_scaled = features_scaled.reset_index(names='bin')

print(f"{features_scaled.shape=}")
features_scaled.head()

# Load the gene information

In [None]:
# --- PanglaoDB mESC marker genes ---
fpath = "../resources/mESC_pangloadb.csv"
pdf = pd.read_csv(fpath)
# every other gene table in this cell is upper-cased before comparison; do the
# same here so the `isin` lookup below is not defeated by letter case
pdf['gene_name'] = pdf['gene_name'].str.upper()
print(f"{pdf.shape=}")
print(f"{pdf['gene_name'].nunique()=}")
print()

# --- gene ontology annotations ---
fpath = "../resources/stem_cell_population_maintenance.csv"
godf = pd.read_csv(fpath)
godf['gene_name'] = godf['gene_name'].str.upper()
print(f"{godf.shape=}")
print(f"{godf['gene_name'].nunique()=}")
print()

# --- gene expression ---
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/rna/expression.parquet"
exdf = pd.read_parquet(fpath)

# mean over every column whose name contains "TPM"
tpm_columns = [x for x in exdf.columns if "TPM" in x]
exdf['expression_mean'] = exdf[tpm_columns].mean(axis=1)
exdf['gene_name'] = exdf['gene_name'].str.upper()
# built BEFORE the expression filter, so genes with zero mean are mapped too
express_map = dict(zip(exdf['gene_name'].values, exdf['expression_mean'].values))

exdf = exdf[exdf['expression_mean'] > 0]
print(f"{exdf.shape=}")
print(f"{exdf['gene_name'].nunique()=}")

# --- gene locations ---
resolution = 1000000
chromosome = "1"

gene_table_path = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/gene_table.parquet"
gdf = pd.read_parquet(gene_table_path)

gdf['gene_name'] = gdf['gene_name'].str.upper()

# use the `chromosome` setting instead of a second hard-coded "1"; .copy() makes
# the column assignments below act on an independent frame rather than a slice
gdf = gdf[gdf['Chromosome'] == chromosome].copy()
gdf['bin'] = gdf['midpoint'].apply(lambda x: ut.bin_loci(x, resolution))


# boolean / numeric annotations per gene
gdf['is_gene'] = True
gdf['is_pt_gene'] = gdf['gene_biotype'] == 'protein_coding'
gdf['is_expressed'] = gdf['gene_name'].isin(exdf['gene_name'].values)
gdf['expression'] = gdf['gene_name'].map(express_map)
gdf['expression'] = gdf['expression'].fillna(0)  # genes absent from the expression table
gdf['expression_log'] = np.log1p(gdf['expression'])
gdf['mESC_panglaoDB_marker'] = gdf['gene_name'].isin(pdf['gene_name'].values)
gdf['mESC_GO_marker'] = gdf['gene_name'].isin(godf['gene_name'].values)

gdf.head()

# load the population pore-c data

In [None]:
# Incidence table for chromosome 1 at 1 Mb (per the file name). The cells below
# treat its rows as nodes (bins) and its columns as hyperedges (reads).
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/population_pore_c/chr1_1000000_incidence.parquet"

df = pd.read_parquet(fpath)
print(f"{df.shape=}")
df.head()

In [None]:
# Clique-expand the FULL incidence table (despite the `core_expanded` name, no
# core has been selected yet) and order both axes by index.
core_expanded = ut.fill_missing_bins(df, df.index)
A = matrix.clique_expand_incidence(core_expanded, zero_diag=False) 
A = A.sort_index(axis=1)
A = A.sort_index(axis=0)

print(f"{A.shape=}")

In [None]:
# A bare top-level `break` is a SyntaxError ("'break' outside loop"), so "Run All"
# stops at this cell — presumably a deliberate stop marker.
break

In [None]:
# extract the higher order contacts only
# column sums of the incidence table; for a 0/1 table this is the number of
# bins each hyperedge touches (assumes binary entries — TODO confirm)
orders = df.sum(axis=0)
print(f"{orders.mean()=:.2f}")

# get the degree of higher order contacts
# positional indices of the columns whose sum is strictly greater than 2
high_order_idx = np.argwhere(orders.values > 2).ravel()
n_higher_order = df.columns[high_order_idx].shape[0]
print(f"Number of higher-order contacts: {n_higher_order}")

# H: same rows as df, restricted to the higher-order columns
H = df[df.columns[high_order_idx]]
print(f"{H.shape=}")

# Unweighted Centrality (all hyperedges)

In [None]:
# re-import the local `centrality` module so source edits take effect without a kernel restart
reload(central)

# one single-column frame per measure is collected here, then concatenated
nodes = []
edges = []

# values passed as `function=` to the centrality routine
measures = [
    'linear',
    'log-exp', 
    'max',
]

for func in measures:
    # returns a (node, edge) pair of centrality vectors, computed on the full table
    ncent, ecent = central.nonlinear_eigenvector_centrality(df.to_numpy(), 
                                                            function=func, 
                                                            maxiter=1000)
    # presumably rescales each vector to [0, 1] — confirm in utils.min_max
    ncent = ut.min_max(ncent)
    ecent = ut.min_max(ecent)
    nodes.append(pd.DataFrame({func : ncent,}, index=df.index))
    edges.append(pd.DataFrame({func : ecent,}, index=df.columns))
    
# one column per measure; reset_index turns the row/column labels of df into a
# regular column (referred to as 'bin' / 'read_code' by the cells below)
nodes = pd.concat(nodes, axis=1).reset_index()
edges = pd.concat(edges, axis=1).reset_index()

print(f"{nodes.shape=}")
print(f"{edges.shape=}")

In [None]:
# NOTE: `pdf` is re-bound here; it no longer holds the PanglaoDB table loaded earlier.
# pd.merge without `on=` joins on the columns the two frames have in common.
pdf = pd.merge(nodes, features_scaled)
pdf = pdf.set_index('bin')

# get correlations
# absolute pairwise correlations, then keep features (rows) x centrality measures (columns)
corr = pdf.corr().abs()
corr = corr[measures]
corr = corr.loc[feature_columns]


plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 3, 3.5

sns.heatmap(data=corr,
            lw=1,
            cmap='coolwarm',
            annot=True,
            cbar_kws={'shrink' : 0.5, 'label' : 'Correlation'},
            )

plt.title('Unweighted Centrality (All Contacts)')

In [None]:
# Side-by-side histograms of the scaled centralities: nodes (left), edges (right).
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 3
fig, axs = plt.subplots(1, 2)

# melt to long form: one row per (bin, measure) with the measure name in 'variable'
sns.histplot(data=nodes.melt(id_vars='bin'),
             bins=51,
             kde=True,
             x='value',
             hue='variable',
             ax=axs[0],
             )
axs[0].set_title('Node Centrality Distribution')

# same for hyperedges, keyed by 'read_code'
sns.histplot(data=edges.melt(id_vars='read_code'),
             bins=51,
             kde=True,
             x='value',
             hue='variable',
             ax=axs[1]
            )
axs[1].set_title('Edge Centrality Distribution')
plt.tight_layout()

# Unweighted centrality, higher order hyperedges only

In [None]:
# re-import the local `centrality` module so source edits take effect without a kernel restart
reload(central)

# NOTE: overwrites the `nodes` / `edges` tables produced for the full table
nodes = []
edges = []

# values passed as `function=` to the centrality routine
measures = [
    'linear',
    'log-exp', 
    'max',
]

for func in measures:
    # same computation as for the full table, but on H (higher-order columns only)
    ncent, ecent = central.nonlinear_eigenvector_centrality(H.to_numpy(), 
                                                            function=func, 
                                                            maxiter=1000)
    # presumably rescales each vector to [0, 1] — confirm in utils.min_max
    ncent = ut.min_max(ncent)
    ecent = ut.min_max(ecent)
    nodes.append(pd.DataFrame({func : ncent,}, index=H.index))
    edges.append(pd.DataFrame({func : ecent,}, index=H.columns))
    
# one column per measure, labels moved into a regular column
nodes = pd.concat(nodes, axis=1).reset_index()
edges = pd.concat(edges, axis=1).reset_index()

print(f"{nodes.shape=}")
print(f"{edges.shape=}")

In [None]:
# pd.merge without `on=` joins on the columns the two frames have in common.
pdf = pd.merge(nodes, features_scaled)
pdf = pdf.set_index('bin')

# get correlations
# absolute pairwise correlations, then keep features (rows) x centrality measures (columns)
corr = pdf.corr().abs()
corr = corr[measures]
corr = corr.loc[feature_columns]


plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 3, 3.5

sns.heatmap(data=corr,
            lw=1,
            cmap='coolwarm',
            annot=True,
            cbar_kws={'shrink' : 0.5, 'label' : 'Correlation'},
            )

plt.title('Unweighted Centrality (Higher-Order Contacts)')

In [None]:
# Side-by-side histograms of the scaled centralities: nodes (left), edges (right).
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 3
fig, axs = plt.subplots(1, 2)

# melt to long form: one row per (bin, measure) with the measure name in 'variable'
sns.histplot(data=nodes.melt(id_vars='bin'),
             bins=51,
             kde=True,
             x='value',
             hue='variable',
             ax=axs[0],
             )
axs[0].set_title('Node Centrality Distribution')

# same for hyperedges, keyed by 'read_code'
sns.histplot(data=edges.melt(id_vars='read_code'),
             bins=51,
             kde=True,
             x='value',
             hue='variable',
             ax=axs[1]
            )
axs[1].set_title('Edge Centrality Distribution')
plt.tight_layout()

In [None]:
# break

# Core Definition

In [None]:
""" Based on the above, there is significant information in the low-order contacts"""

# Recompute the unweighted centralities on the FULL table `df`, replacing the
# `nodes` / `edges` tables that the previous section derived from H.
nodes = []
edges = []

# values passed as `function=` to the centrality routine
measures = [
    'linear',
    'log-exp', 
    'max',
]

for func in measures:
    # returns a (node, edge) pair of centrality vectors
    ncent, ecent = central.nonlinear_eigenvector_centrality(df.to_numpy(), 
                                                            function=func, 
                                                            maxiter=1000)
    # presumably rescales each vector to [0, 1] — confirm in utils.min_max
    ncent = ut.min_max(ncent)
    ecent = ut.min_max(ecent)
    nodes.append(pd.DataFrame({func : ncent,}, index=df.index))
    edges.append(pd.DataFrame({func : ecent,}, index=df.columns))
    
# one column per measure, labels moved into a regular column
nodes = pd.concat(nodes, axis=1).reset_index()
edges = pd.concat(edges, axis=1).reset_index()

print(f"{nodes.shape=}")
print(f"{edges.shape=}")

In [None]:
def get_node_core(nodes, col='log-exp', threshold=0.9):
    """
    Select the nodes whose score in `col` is strictly above `threshold`.

    The cutoff is applied to the raw values of the column (it is an absolute
    value, not a quantile).

    Args:
        nodes (DataFrame): One row per node, with a 'bin' column and a score column `col`.
        col (str): The column in 'nodes' to compare against the threshold.
        threshold (float): Rows with `nodes[col] > threshold` are kept.

    Returns:
        np.ndarray: The 'bin' values of the selected rows, in their original order.
    """
    above_cutoff = nodes[col] > threshold
    return nodes.loc[above_cutoff, 'bin'].values


def get_edge_core(edges, col='log-exp', threshold=0.9):
    """
    Select the hyperedges whose score in `col` is strictly above `threshold`.

    The cutoff is applied to the raw values of the column (it is an absolute
    value, not a quantile).

    Args:
        edges (DataFrame): One row per hyperedge, with a 'read_code' column and a score column `col`.
        col (str): The column in 'edges' to compare against the threshold.
        threshold (float): Rows with `edges[col] > threshold` are kept.

    Returns:
        np.ndarray: The 'read_code' values of the selected rows, in their original order.
    """
    above_cutoff = edges[col] > threshold
    return edges.loc[above_cutoff, 'read_code'].values


# cutoffs applied to the scaled centralities (strictly-greater-than comparison)
n_threshold = 0.6
e_threshold = 0.9

# which centrality column to threshold for nodes and for edges
n_col = 'log-exp'
e_col = 'log-exp'

node_idx = get_node_core(nodes, col=n_col, threshold=n_threshold)
edge_idx = get_edge_core(edges, col=e_col, threshold=e_threshold)

# the core: rows = selected bins, columns = selected hyperedges
core = df.loc[node_idx, edge_idx].copy()
print(f"{core.shape=}")

node_params = {
    's' : 1,
    'ec' : 'k',
    'lw' : 1,
    'marker' : ".",
    'zorder' : 2,
}

line_params = {
    'lw' : 0.1,
    'alpha' : 0.5,
    'zorder' : 1,
}

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 5

# Plot a random subset of hyperedges. DataFrame.sample raises when asked for
# more items than exist, so cap the request at the number of core hyperedges;
# the fixed seed makes the figure reproducible.
sample_size = 500
n_sampled = min(sample_size, core.shape[1])
core_sample = core.T.sample(n_sampled, random_state=0).T
plt2.plot_incidence(ut.sort_by_lowest_index(core_sample), 
                    node_color='k',
                    node_params=node_params,
                    line_params=line_params)

plt.title("The Core: Chromosome 1")

In [None]:
# Re-expand the core onto the bins of the full table, clique-expand it into a
# bin x bin matrix, and order both axes by index.
core_expanded = ut.fill_missing_bins(core, df.index)
A = matrix.clique_expand_incidence(core_expanded, zero_diag=False) 
A = A.sort_index(axis=1)
A = A.sort_index(axis=0)

print(f"{A.shape=}")

# A = matrix.normalize_oe(A)

plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 5, 5

# log1p compresses the dynamic range of the counts for display
sns.heatmap(np.log1p(A), 
            cmap='plasma',
            square=True, 
            cbar_kws={'shrink' : 0.5, 'label' : 'Contacts (log)'},
           )

# hide tick labels on both axes
plt.yticks([])
plt.xticks([])

plt.title('The Core (Clique-Expanded)')
plt.ylabel("Chromosome 1 Loci (1Mb)")
plt.xlabel("")

In [None]:
# break

# Core Sensitivity

In [None]:
# For every measure, count how many edges / nodes survive each of 100 evenly
# spaced thresholds in [0, 1].
edge_res = []
node_res = []
for func in measures:
    for t in np.linspace(0, 1, 100):
        e_idx = get_edge_core(edges, col=func, threshold=t)
        e_row = {
            'Function' : func,
            'Threshold' : t,
            'Edges' : len(e_idx),
        }
        edge_res.append(e_row)
        
        n_idx = get_node_core(nodes, col=func, threshold=t)
        n_row = {
            'Function' : func,
            'Threshold' : t,
            'Nodes' : len(n_idx),
        }
        node_res.append(n_row)
        
        
# lists of records -> long-form tables, one row per (measure, threshold)
edge_res = pd.DataFrame(edge_res)
node_res = pd.DataFrame(node_res)

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 4

fig, axs = plt.subplots(1, 2)


# left panel: surviving nodes per threshold, one line per measure
sns.lineplot(data=node_res, 
             x='Threshold',
             y='Nodes',
             hue='Function',
             style='Function',
             ax=axs[0],
            )

# right panel: surviving edges per threshold, one line per measure
sns.lineplot(data=edge_res, 
             x='Threshold',
             y='Edges',
             hue='Function',
             style='Function',
             ax=axs[1],
            )

axs[0].set_title('Nodes in Core by Threshold')
axs[1].set_title('Edges in Core by Threshold')

plt.tight_layout()

In [None]:
# re-import the local `matrix` module so source edits take effect without a kernel restart
reload(matrix)

def compute_core_metrics(df, nodes, edges, func='log-exp', n_bins=10):
    """
    Sweeps node/edge centrality thresholds and summarises each resulting core.

    For every pair of thresholds on an `n_bins` x `n_bins` grid over [0, 1], the
    core is the sub-table of `df` restricted to the nodes and edges whose score
    in column `func` is strictly above the respective threshold. Empty cores
    are skipped (no row is emitted for them).

    Args:
        df (pd.DataFrame): Incidence table; rows are nodes, columns are hyperedges.
        nodes (pd.DataFrame): Node table with a 'bin' column and a score column `func`.
        edges (pd.DataFrame): Edge table with a 'read_code' column and a score column `func`.
        func (str, optional): Column name in 'nodes' and 'edges' for thresholding. Defaults to 'log-exp'.
        n_bins (int, optional): Number of thresholds per axis. Defaults to 10.

    Returns:
        pd.DataFrame: One row per non-empty core with its thresholds, size,
        density, mean degree/order, Fiedler number and entropy. The spectral
        metrics are reported as 0 for cores with 2 or fewer nodes or edges.
    """
    res = []
    n, m = df.shape

    thresholds = np.linspace(0, 1, n_bins)

    for n_t in thresholds:
        # the node selection does not depend on the edge threshold: compute it once
        n_idx = get_node_core(nodes, col=func, threshold=n_t)

        for e_t in thresholds:
            e_idx = get_edge_core(edges, col=func, threshold=e_t)

            core = df.loc[n_idx, e_idx].copy()

            # Check for empty core
            if core.empty:
                continue  # Skip this iteration if the core is empty

            mean_degree = core.sum(axis=1).mean()
            mean_order = core.sum(axis=0).mean()
            core_sum = core.sum().sum()

            nn, mm = core.shape

            # Minimum core size check
            if nn > 2 and mm > 2:
                L = matrix.hypergraph_laplacian(core)
                Lnorm = matrix.normalized_hypergraph_laplacian(core)

                # metrics
                fiedler_number = matrix.estimate_fiedler(Lnorm)
                entropy = matrix.hypergraph_entropy(L)
            else:
                # Both spectral metrics need a default here. Previously only
                # `fiedler_number` was set, so `entropy` was either undefined
                # (NameError) or silently carried over from an earlier core.
                fiedler_number = 0
                entropy = 0

            row = {
                'node_threshold': n_t,
                'edge_threshold': e_t,
                'n_nodes': nn,
                'n_edges': mm,
                'core_sum': core_sum,
                'core_shape': core.shape,
                'percent_original': core.size / df.size,
                # core is non-empty at this point, so core.size > 0
                'core_density': core_sum / core.size,
                'percent_nodes': nn / n,
                'percent_edges': mm / m,
                'mean_degree': mean_degree,
                'mean_order': mean_order,
                'fiedler_number': fiedler_number,
                'entropy' : entropy,
            }
            res.append(row)

    res = pd.DataFrame(res)
    return res


def find_optimal_thresholds(metrics, weights):
    """
    Finds the row of `metrics` that maximises a weighted sum of metric columns.

    Side effect: the objective is stored in-place in `metrics['weighted_obj']`
    (overwriting any previous value), so the caller's frame gains that column.

    Args:
        metrics (pd.DataFrame): DataFrame with metrics data (must include columns specified in `weights`).
        weights (dict): Dictionary of metric names and their corresponding weights (positive for maximization, negative for minimization).

    Returns:
        pd.Series: Row from `metrics` with the largest weighted objective,
        including its 'weighted_obj' value.

    Raises:
        ValueError: If `metrics` has no rows.
        KeyError: If a key of `weights` is not a column of `metrics`.
    """
    if len(metrics) == 0:
        raise ValueError("`metrics` is empty; there are no thresholds to choose from")

    missing = [name for name in weights if name not in metrics.columns]
    if missing:
        raise KeyError(f"columns missing from `metrics`: {missing}")

    # Column-wise accumulation gives the same sum as a row-by-row apply, without
    # the per-row Python overhead.
    weighted_obj = pd.Series(0, index=metrics.index)
    for metric, weight in weights.items():
        weighted_obj = weighted_obj + metrics[metric] * weight

    metrics['weighted_obj'] = weighted_obj
    optimal_row = metrics.sort_values('weighted_obj', ascending=False).iloc[0]

    return optimal_row

# Trial run of the threshold sweep on a 10 x 10 grid using the 'log-exp' scores.
metrics = compute_core_metrics(df, 
                               nodes, 
                               edges, 
                               func='log-exp',
                               n_bins=10)
print(f"{metrics.shape=}")
metrics.head()

# Compute all metrics

In [None]:
# Compute all metrics
# thresholds per axis -> up to n_bins * n_bins cores per measure
n_bins = 15

# measure name -> metrics table returned by compute_core_metrics
all_metrics = {}

for func in measures:
    print(f"Working `{func}`...")
    metrics = compute_core_metrics(df, 
                                   nodes, 
                                   edges, 
                                   func=func,
                                   n_bins=n_bins,
                                  )
    
    all_metrics[func] = metrics
    
print('done!')
all_metrics['linear'].head()

In [None]:
# re-import the local `plotting` module so source edits take effect without a kernel restart
reload(plt2)

x_col = "edge_threshold"    
y_col = "node_threshold"   

# metrics to draw as the z axis (see the `break` at the end of the inner loop)
z_columns = [
    # "percent_original", 
    # "core_density",
    # "mean_degree",
    # "mean_order",
    "fiedler_number",
    "entropy",
]

plot_kwargs = {
    'cmap' : 'viridis',
}

for func in measures:
    metrics = all_metrics[func]
    
    # one azimuth per entry of z_columns
    azims = [30, 30]
    for i, z_col in enumerate(z_columns):
        perspective_kwargs = {
            'azim' : azims[i],
            'zoom' : 0.8
        }
        plt2.plot_3d_surface(metrics, 
                             x_col, 
                             y_col,
                             z_col,
                             plot_kwargs, 
                             perspective_kwargs,
                            )
        
        ax = plt.gca()

        # get the maximum value 
        # weight 1 on a single column: the "optimal" row is the one maximising z_col
        weights = {
            z_col : 1,
        }
        max_row = find_optimal_thresholds(metrics, weights)
        print(max_row.to_markdown(headers=['Feature', 'Value']))

        # mark the maximising grid point in red on the surface
        ax.scatter(max_row[x_col], 
                   max_row[y_col], 
                   max_row[z_col], 
                   marker="o", c='r', s=40,
                   zorder=2)

        plt.title(func.title())
        plt.show()
        # leaves the inner loop after the first z column, so only
        # z_columns[0] ("fiedler_number") is drawn for each measure
        break

In [None]:
# break

# maximizing the fiedler value

In [None]:
# Build the core at the thresholds that maximise the Fiedler number for the
# 'linear' measure.
func = 'linear'

metrics = all_metrics[func]

print(f"{metrics.shape=}")

# get the maximum value 
weights = {
    "fiedler_number" : 1,
}
max_row = find_optimal_thresholds(metrics, weights)
print(max_row.to_markdown(headers=['Feature', 'Value']))

n_threshold = max_row['node_threshold']
e_threshold = max_row['edge_threshold']

# actually build the core
node_idx = get_node_core(nodes, col=func, threshold=n_threshold)
edge_idx = get_edge_core(edges, col=func, threshold=e_threshold)

# rows = selected bins, columns = selected hyperedges
core = df.loc[node_idx, edge_idx].copy()
print(f"{core.shape=}")

In [None]:
# keyword dictionaries handed to plt2.plot_incidence for node markers and edge lines
node_params = {
    's' : 1,
    'ec' : 'k',
    'lw' : 1,
    'marker' : ".",
    'zorder' : 2,
}

line_params = {
    'lw' : 0.1,
    'alpha' : 0.5,
    'zorder' : 1,
}

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 4

# the whole core is drawn here (no sub-sampling of hyperedges)
plt2.plot_incidence(ut.sort_by_lowest_index(core), 
                    node_color='k',
                    node_params=node_params,
                    line_params=line_params)

plt.title(f"The Core: Chromosome 1 (Maximizing Fiedler Value)")

In [None]:
# Re-expand the core onto the bins of the full table, clique-expand it into a
# bin x bin matrix, and order both axes by index.
core_expanded = ut.fill_missing_bins(core, df.index)
A = matrix.clique_expand_incidence(core_expanded, zero_diag=False) 
A = A.sort_index(axis=1)
A = A.sort_index(axis=0)

print(f"{A.shape=}")

plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 5, 5

# log1p compresses the dynamic range of the counts for display
sns.heatmap(np.log1p(A), 
            cmap='plasma',
            square=True, 
            cbar_kws={'shrink' : 0.5, 'label' : 'Contacts (log)'},
           )

# hide tick labels on both axes
plt.yticks([])
plt.xticks([])

plt.title('The Core (Clique-Expanded)')
plt.ylabel("Chromosome 1 Loci (1Mb)")
plt.xlabel("")

# Weighted Centrality

In [None]:
# peek at the gene table whose 'expression' column supplies the node weights below
gdf.head()

In [None]:
# define node weights based on expression
# total expression per bin, aligned to the rows of df; bins without genes get 0
node_weights = gdf.groupby('bin')['expression'].sum().reindex(df.index)
node_weights = node_weights.fillna(0)
node_weights = node_weights.values.reshape(-1, 1)  # column vector, one row per node

print(f"{node_weights.shape=}")

# define edge weights based on order
# NOTE: computed but not used — the `edge_weights=` argument below is commented out
edge_weights = df.sum(axis=0)
edge_weights = edge_weights.values.reshape(-1, 1)

print(f"{edge_weights.shape=}")

# NOTE: overwrites the unweighted `nodes` / `edges` tables
nodes = []
edges = []

# values passed as `function=` to the centrality routine
measures = [
    'linear',
    'log-exp', 
    'max',
]

for func in measures:
    ncent, ecent = central.nonlinear_eigenvector_centrality(df.to_numpy(), 
                                                            function=func, 
                                                            node_weights=node_weights,
                                                            # edge_weights=edge_weights,
                                                            maxiter=1000)
    # presumably rescales each vector to [0, 1] — confirm in utils.min_max
    ncent = ut.min_max(ncent)
    ecent = ut.min_max(ecent)
    nodes.append(pd.DataFrame({func : ncent,}, index=df.index))
    edges.append(pd.DataFrame({func : ecent,}, index=df.columns))
    
# one column per measure, labels moved into a regular column
nodes = pd.concat(nodes, axis=1).reset_index()
edges = pd.concat(edges, axis=1).reset_index()

nodes.head()

In [None]:
# Side-by-side histograms of the weighted centralities: nodes (left), edges (right).
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 3
fig, axs = plt.subplots(1, 2)

# melt to long form: one row per (bin, measure) with the measure name in 'variable'
sns.histplot(data=nodes.melt(id_vars='bin'),
             bins=51,
             kde=True,
             x='value',
             hue='variable',
             ax=axs[0],
             )
axs[0].set_title('Node Centrality Distribution')

# same for hyperedges, keyed by 'read_code'
sns.histplot(data=edges.melt(id_vars='read_code'),
             bins=51,
             kde=True,
             x='value',
             hue='variable',
             ax=axs[1]
            )

axs[1].set_title('Edge Centrality Distribution')
plt.tight_layout()

In [None]:
# pd.merge without `on=` joins on the columns the two frames have in common.
pdf = pd.merge(nodes, features_scaled)
pdf = pdf.set_index('bin')

# get correlations
# absolute pairwise correlations, then keep features (rows) x centrality measures (columns)
corr = pdf.corr().abs()
corr = corr[measures]
corr = corr.loc[feature_columns]

plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 3, 3.5

sns.heatmap(data=corr,
            lw=1,
            cmap='coolwarm',
            annot=True,
            cbar_kws={'shrink' : 0.5, 'label' : 'Correlation'},
            )

plt.title('Weighted Centrality')

In [None]:
# Compute all metrics
# Same sweep as before, now on the expression-weighted `nodes` / `edges` and a
# finer 20 x 20 grid. Replaces the earlier `all_metrics`.
n_bins = 20

# measure name -> metrics table returned by compute_core_metrics
all_metrics = {}

for func in measures:
    print(f"Working `{func}`...")
    metrics = compute_core_metrics(df, 
                                   nodes, 
                                   edges, 
                                   func=func,
                                   n_bins=n_bins,
                                  )
    
    all_metrics[func] = metrics
    
all_metrics['linear'].head()

In [None]:
# re-import the local `plotting` module so source edits take effect without a kernel restart
reload(plt2)

# surface of the Fiedler number over the threshold grid for the 'max' measure
func = 'max'
metrics = all_metrics[func]

plot_kwargs = {
    'cmap' : 'viridis',
    'zorder' : 1,
}

perspective_kwargs = {
    'azim' : 30,
    'zoom' : 0.8
}

x_col = "edge_threshold"    
y_col = "node_threshold"   
z_col = "fiedler_number"
  
plt2.plot_3d_surface(metrics, 
                     x_col, 
                     y_col,
                     z_col,
                     plot_kwargs, 
                     perspective_kwargs,
                    )
ax = plt.gca()

# get the maximum value 
# weight 1 on a single column: the "optimal" row is the one maximising z_col
weights = {
    z_col : 1,    
}

optim = find_optimal_thresholds(metrics, weights)
print(optim.to_markdown(headers=['Feature', 'Value']))

# mark the maximising grid point in red on the surface
ax.scatter(optim[x_col], 
           optim[y_col],
           optim[z_col],
           marker="o", c='r', s=40,
           zorder=2)

In [None]:
# Pick thresholds for the 'max' measure with a multi-term objective and build
# the corresponding core.
func = 'max'
metrics = all_metrics[func]


# objective = 0.1 * edge_threshold + fiedler_number - percent_original
weights = {
    # 'node_threshold': 1,     # Maximize
    'edge_threshold': 0.1,     # Maximize
    'fiedler_number': 1,     # Maximize
    'percent_original': -1,     # Minimize
    # 'entropy': -1,         # Minimize
}

optim = find_optimal_thresholds(metrics, weights)
print(optim.to_markdown())
print()

node_idx = get_node_core(nodes, col=func, threshold=optim['node_threshold'])
edge_idx = get_edge_core(edges, col=func, threshold=optim['edge_threshold'])

# rows = selected bins, columns = selected hyperedges
core = df.loc[node_idx, edge_idx].copy()
print(f"{core.shape=}")

# Workroom

In [None]:
func = 'linear'

# Define node weights from the scaled 1D features: one weight vector per feature.
# The weights are handed to the centrality routine positionally, so the rows must
# line up one-to-one with the rows of `df`. Align them explicitly on df.index
# instead of relying on a hard-coded `bin > 2` filter happening to select exactly
# the bins present in df. Bins without feature values become NaN and are filled
# with 0 inside the loop.
node_weight_df = features_scaled.set_index('bin').reindex(df.index)
print(f"{node_weight_df.shape=}")

# NOTE: overwrites `nodes` / `edges`; their columns are now feature names
nodes = []
edges = []

for feature in node_weight_df.columns:
    
    weight_vector = node_weight_df[feature].fillna(0)
    weight_vector = np.log1p(weight_vector)
    weight_vector = weight_vector.values.reshape(-1, 1)  # column vector, one row per node
    
    ncent, ecent = central.nonlinear_eigenvector_centrality(df.to_numpy(), 
                                                            function=func, 
                                                            node_weights=weight_vector,
                                                            maxiter=1000)
    # presumably rescales each vector to [0, 1] — confirm in utils.min_max
    ncent = ut.min_max(ncent)
    ecent = ut.min_max(ecent)
    nodes.append(pd.DataFrame({feature : ncent,}, index=df.index))
    edges.append(pd.DataFrame({feature : ecent,}, index=df.columns))
    
# one column per feature, labels moved into a regular column
nodes = pd.concat(nodes, axis=1).reset_index()
edges = pd.concat(edges, axis=1).reset_index()

nodes.head()

In [None]:
# Hierarchically cluster the bins by their feature-weighted centrality profiles.
pdf = nodes.set_index('bin')

# Ward linkage over the rows of pdf (one row per bin)
Z = linkage(pdf.to_numpy(), method='ward')

# cut the dendrogram at this cophenetic distance to obtain flat clusters
max_d = 2

clusters = fcluster(Z, max_d, criterion='distance')
row_colors = clusters
row_colors = plt2.floats_to_colors(row_colors, colormap='tab20b')

# pdf is transposed, so bins are the heatmap columns; the precomputed linkage
# and the cluster colours are therefore passed as the column arguments
sns.clustermap(data=pdf.T,
               col_colors=row_colors,
               col_linkage=Z,
               figsize=(15, 7),
               cmap='coolwarm',
               row_cluster=False)

plt.ylabel("")

In [None]:
# A bare top-level `break` is a SyntaxError ("'break' outside loop"), so "Run All"
# stops at this cell — presumably a deliberate stop marker.
break

In [None]:
# `nodes` now has one column per feature (named exactly like the feature), so it
# shares EVERY column name with `features_scaled`. A merge without `on=` would
# join on all of them and keep only rows where each centrality equals the
# feature value. Join on the bin alone and suffix the centrality columns.
pdf = pd.merge(nodes, features_scaled, on='bin', suffixes=('_centrality', ''))
pdf = pdf.set_index('bin')

centrality_columns = [f"{c}_centrality" for c in node_weight_df.columns]

# get correlations
# rows: raw features; columns: centrality weighted by the named feature
corr = pdf.corr().abs()
corr = corr[centrality_columns]
corr = corr.loc[feature_columns]
corr.columns = node_weight_df.columns  # drop the suffix for display

plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 3, 3.5

sns.heatmap(data=corr,
            lw=1,
            cmap='coolwarm',
            annot=True,
            cbar_kws={'shrink' : 0.5, 'label' : 'Correlation'},
            )

plt.title('Weighted Centrality')

In [None]:
# last rows of the chromosome-1 gene table
gdf.tail()

# Archive

In [None]:
# A bare top-level `break` is a SyntaxError ("'break' outside loop"), so "Run All"
# stops at this cell — presumably a deliberate stop marker before the archive.
break

In [None]:
# keyword dictionaries handed to plt2.plot_incidence for node markers and edge lines
node_params = {
    's' : 1,
    'ec' : 'k',
    'lw' : 1,
    'marker' : ".",
    'zorder' : 2,
}

line_params = {
    'lw' : 0.1,
    'alpha' : 0.5,
    'zorder' : 1,
}

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 4

# Plot a random subset of hyperedges. DataFrame.sample raises when asked for
# more items than exist, so cap the request at the number of core hyperedges;
# the fixed seed makes the figure reproducible.
sample_size = 500
n_sampled = min(sample_size, core.shape[1])
core_sample = core.T.sample(n_sampled, random_state=0).T
plt2.plot_incidence(ut.sort_by_lowest_index(core_sample), 
                    node_color='k',
                    node_params=node_params,
                    line_params=line_params)

plt.title("The Core: Chromosome 1 (Maximizing Fiedler Value)")

In [None]:
# re-import the local `plotting` module so source edits take effect without a kernel restart
reload(plt2)
# flag genes whose bin is among the currently selected core nodes
gdf['in_core'] = gdf['bin'].isin(node_idx)

# NOTE(review): no visible cell creates an 'is_tf' column on gdf — confirm where
# it is supposed to come from before relying on this cell.
plt2.plot_venn3_from_df(gdf, 
                   col1='is_tf',
                   col2='is_expressed', 
                   col3='in_core',
                   set_labels=['TF', 'Expressed', 'Core']
                  )

In [None]:
# A bare top-level `break` is a SyntaxError ("'break' outside loop"), so "Run All"
# stops at this cell — presumably a deliberate stop marker.
break

In [None]:
# list the metric names available in the current `metrics` table
metrics.columns

In [None]:
# A bare top-level `break` is a SyntaxError ("'break' outside loop"), so "Run All"
# stops at this cell — presumably a deliberate stop marker.
break

In [None]:
# Row of the current `metrics` table with the largest Fiedler number
# (idxmax returns the label of the first maximum).
max_row = metrics.loc[metrics['fiedler_number'].idxmax()]
n_threshold = max_row['node_threshold']
e_threshold = max_row['edge_threshold']

print(f"node_threshold={n_threshold:.3}")
print(f"edge_threshold={e_threshold:.3}")

# NOTE: relies on the global `func` left behind by an earlier cell
node_idx = get_node_core(nodes, col=func, threshold=n_threshold)
edge_idx = get_edge_core(edges, col=func, threshold=e_threshold)

# rows = selected bins, columns = selected hyperedges
core = df.loc[node_idx, edge_idx].copy()
print(f"{core.shape=}")

In [None]:
# re-import the local `matrix` module, then build the Laplacian of the current core
reload(matrix)
L = matrix.hypergraph_laplacian(core)


def hypergraph_entropy(L):
    """
    Calculates the hypergraph entropy from the hypergraph Laplacian matrix L.

    The eigenvalues of L are clipped at zero, normalized so that they sum to
    one, and inserted into the Shannon formula ``-sum(p * log(p))`` (natural
    logarithm). Zero eigenvalues contribute nothing.

    The full spectrum is used. A sparse solver such as ``eigsh`` can return at
    most n-1 of the n eigenvalues, which would leave the largest one out of the
    normalization, so a dense symmetric solver is used instead.

    Args:
        L: Square, symmetric Laplacian. Dense array-likes (ndarray, DataFrame)
            and objects exposing ``toarray()`` (e.g. scipy sparse matrices) are
            accepted.

    Returns:
        float: The hypergraph entropy; 0.0 when L has no positive eigenvalue.
    """
    dense = L.toarray() if hasattr(L, "toarray") else L
    dense = np.asarray(dense, dtype=float)

    # eigvalsh: eigenvalues of a symmetric matrix, all n of them
    eigenvalues = np.linalg.eigvalsh(dense)

    # Ensure proper handling of small or negative values close to zero to avoid numerical issues.
    eigenvalues = np.maximum(eigenvalues, 0)

    total = eigenvalues.sum()
    if total <= 0:
        # all-zero spectrum: nothing to normalize, define the entropy as 0
        return 0.0

    # Normalize eigenvalues. This ensures sum of normalized eigenvalues equals 1
    normalized_eigenvalues = eigenvalues / total

    # drop exact zeros BEFORE taking the log so log(0) is never evaluated
    positive = normalized_eigenvalues[normalized_eigenvalues > 0]
    entropy = -np.sum(positive * np.log(positive))

    return float(entropy)

# entropy of the current core's Laplacian (displayed as the cell output)
hypergraph_entropy(L)

In [None]:
# A bare top-level `break` is a SyntaxError ("'break' outside loop"), so "Run All"
# stops at this cell — presumably a deliberate stop marker.
break

# Genes and the Core

In [None]:
# re-import the local `plotting` module so source edits take effect without a kernel restart
reload(plt2)
# flag genes whose bin is among the currently selected core nodes
gdf['in_core'] = gdf['bin'].isin(node_idx)

# NOTE(review): no visible cell creates an 'is_tf' column on gdf — confirm where
# it is supposed to come from before relying on this cell.
plt2.plot_venn3_from_df(gdf, 
                   col1='is_tf',
                   col2='is_expressed', 
                   col3='in_core',
                   set_labels=['TF', 'Expressed', 'Core']
                  )

In [None]:
# A bare top-level `break` is a SyntaxError ("'break' outside loop"), so "Run All"
# stops at this cell — presumably a deliberate stop marker.
break

In [None]:
# need to overwrite the core column

def plot_venn3_from_df(df, col1, col2, col3, set_labels=None, title="Venn Diagram"):
    """Plots a 3-way Venn diagram from boolean columns in a DataFrame.

    Each row of `df` is one item; a truthy value in a column means the item
    belongs to that set. A new 8x8 figure is created and shown.

    Args:
        df (pd.DataFrame): DataFrame containing the boolean columns.
        col1, col2, col3 (str): Names of the columns to use for the Venn diagram.
        set_labels (list, optional): Labels for the sets (defaults to column names).
        title (str, optional): Title for the Venn diagram.
    """
    # Coerce to bool so that `~` is a logical NOT whatever the stored dtype is.
    in_a = df[col1].astype(bool)
    in_b = df[col2].astype(bool)
    in_c = df[col3].astype(bool)

    # Sizes of the seven disjoint regions of the diagram
    only_a = (in_a & ~in_b & ~in_c).sum()
    only_b = (in_b & ~in_a & ~in_c).sum()
    only_c = (in_c & ~in_a & ~in_b).sum()

    a_and_b = (in_a & in_b & ~in_c).sum()
    a_and_c = (in_a & in_c & ~in_b).sum()
    b_and_c = (in_b & in_c & ~in_a).sum()

    all_three = (in_a & in_b & in_c).sum()

    # Create the Venn diagram
    if set_labels is None:
        set_labels = (col1, col2, col3)  # Use column names as default labels

    plt.figure(figsize=(8, 8))
    # venn3 expects the regions in the order (Abc, aBc, ABc, abC, AbC, aBC, ABC)
    venn3(subsets=(only_a,
                   only_b,
                   a_and_b,
                   only_c,
                   a_and_c,
                   b_and_c,
                   all_three), set_labels=set_labels)
    # the `title` argument was previously accepted but never applied
    plt.title(title)
    plt.show()
    
    

# flag genes whose bin is among the currently selected core nodes
# NOTE(review): no visible cell creates an 'is_tf' column on gdf — confirm.
gdf['in_core'] = gdf['bin'].isin(node_idx)
plot_venn3_from_df(gdf, 
                   col1='is_tf',
                   col2='is_expressed', 
                   col3='in_core',
                   set_labels=['TF', 'Expressed', 'Core']
                  )

# NOTE(review): the statements below repeat the ones above verbatim; as written
# the same diagram is drawn twice.
gdf['in_core'] = gdf['bin'].isin(node_idx)

plot_venn3_from_df(gdf, 
                   col1='is_tf',
                   col2='is_expressed', 
                   col3='in_core',
                   set_labels=['TF', 'Expressed', 'Core']
                  )

# Weighted Centrality

Weighted by the gene expression values

In [None]:
# Node weights: summed log1p expression per bin, aligned to the rows of df;
# bins without genes get 0.
expression_by_bin = gdf.groupby('bin')['expression_log'].sum().reindex(df.index)
expression_by_bin = expression_by_bin.fillna(0)
expression_by_bin = expression_by_bin.values.reshape(-1, 1)  # column vector, one row per node

print(f"{expression_by_bin.shape=}")

# re-import the local `centrality` module so source edits take effect without a kernel restart
reload(central)

# NOTE: overwrites the existing `nodes` / `edges` tables
nodes = []
edges = []

# values passed as `function=` to the centrality routine
measures = [
    'linear',
    'log-exp', 
    'max',
]

for func in measures:
    # computed on H (higher-order columns only) with expression-based node weights
    ncent, ecent = central.nonlinear_eigenvector_centrality(H.to_numpy(), 
                                                            function=func, 
                                                            node_weights=expression_by_bin,
                                                            maxiter=1000)
    # presumably rescales each vector to [0, 1] — confirm in utils.min_max
    ncent = ut.min_max(ncent)
    ecent = ut.min_max(ecent)
    nodes.append(pd.DataFrame({func : ncent,}, index=H.index))
    edges.append(pd.DataFrame({func : ecent,}, index=H.columns))
    
# one column per measure, labels moved into a regular column
nodes = pd.concat(nodes, axis=1).reset_index()
edges = pd.concat(edges, axis=1).reset_index()

print(f"{nodes.shape=}")
print(f"{edges.shape=}")

nodes.head()

In [None]:
# load 1D features 
# Directory holding one parquet table of 1D features per assay. Only files
# whose name contains `resolution` are loaded.
dpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/1D_features/"
resolution = "1000000"

features = []
# sorted(): os.listdir returns entries in arbitrary order, which would make the
# column order of `features` (and every downstream heatmap) vary between runs.
for f in sorted(os.listdir(dpath)):
    if resolution not in f:
        continue
    # the text before the first underscore of the file name prefixes every column
    dtype = f.split("_")[0]
    fpath = f"{dpath}{f}"
    fdf = pd.read_parquet(fpath)
    fdf = fdf.set_index('index')
    fdf.columns = [f"{dtype}_{x}" for x in fdf.columns]
    features.append(fdf)

if not features:
    # pd.concat([]) fails with an unhelpful message; say what is actually wrong
    raise FileNotFoundError(f"no feature files matching '{resolution}' in {dpath}")

# side-by-side join on the shared index; entries missing from a table become 0
features = pd.concat(features, axis=1)
features = features.fillna(0)

## feature scaling
# every column is rescaled independently to [0, 1]
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(0, 1))
features_scaled = scaler.fit_transform(features)
features_scaled = pd.DataFrame(features_scaled, 
                               index=features.index, 
                               columns=features.columns)

# keep every feature except those whose name contains "RNA" (expression is
# what the centrality was weighted by in this section)
feature_columns = features.columns
feature_columns = [x for x in feature_columns if "RNA" not in x]

# expose the index as a regular 'bin' column so it can be merged on later
features_scaled = features_scaled.reset_index(names='bin')

print(f"{features_scaled.shape=}")
features_scaled.head()

In [None]:
# pd.merge without `on=` joins on the columns the two frames have in common.
pdf = pd.merge(nodes, features_scaled)
pdf = pdf.set_index('bin')

# get correlations
# absolute pairwise correlations, then keep features (rows) x centrality measures (columns)
corr = pdf.corr().abs()
corr = corr[measures]
corr = corr.loc[feature_columns]


plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 3, 3.5

sns.heatmap(data=corr,
            lw=1,
            cmap='coolwarm',
            annot=True,
            cbar_kws={'shrink' : 0.5, 'label' : 'Correlation'},
            )

plt.title('Weighted Centrality (by RNA Expression)')

In [None]:
# correlation with other features

In [None]:
# A bare top-level `break` is a SyntaxError ("'break' outside loop"), so "Run All"
# stops at this cell — presumably a deliberate stop marker.
break

In [None]:
# Sweep node/edge centrality thresholds on an n_bins x n_bins grid and record,
# for every pair, the size of the resulting core and its estimated Fiedler
# number.
func = 'log-exp'
n_bins = 20

n, m = df.shape  # NOTE(review): unused in this cell; kept in case later cells read n / m

# One threshold grid shared by both loops, built once.
thresholds = np.linspace(0, 1, n_bins)

res = []

for n_t in thresholds:
    # The node core depends only on the node threshold, so select it once per
    # outer iteration rather than once per (node, edge) pair.
    n_idx = get_node_core(nodes, col=func, threshold=n_t)

    for e_t in thresholds:
        e_idx = get_edge_core(edges, col=func, threshold=e_t)

        core = df.loc[n_idx, e_idx].copy()

        nn, mm = core.shape

        # doesn't make sense for tiny cores.....
        if nn > 3 and mm > 3:
            L = normalized_hypergraph_laplacian(core)
            fiedler_number = estimate_fiedler(L)
        else:
            # Cores with 3 or fewer nodes or edges are scored 0.
            fiedler_number = 0

        row = {
            'node_threshold' : n_t,
            'edge_threshold' : e_t,
            'n_nodes' : nn,
            'n_edges' : mm,
            'fiedler_number' : fiedler_number,
        }
        res.append(row)

res = pd.DataFrame(res)
print(f"{res.shape=}")
res.head()

In [None]:
# Pick the sweep row with the largest Fiedler number (idxmax returns the
# label of the first maximum) and read its two thresholds.
best_label = res['fiedler_number'].idxmax()
max_row = res.loc[best_label]
n_threshold, e_threshold = max_row['node_threshold'], max_row['edge_threshold']

for label, value in (("node_threshold", n_threshold),
                     ("edge_threshold", e_threshold)):
    print(f"{label}={value:.3}")

# Re-select the node / edge sets at the chosen thresholds.
node_idx = get_node_core(nodes, threshold=n_threshold, col=func)
edge_idx = get_edge_core(edges, threshold=e_threshold, col=func)

# NOTE(review): the core is sliced from `H` here while the sweep cell slices
# `df` -- confirm both refer to the same incidence matrix.
core = H.loc[node_idx, edge_idx].copy()
print(f"{core.shape=}")

In [None]:
# Draw the selected core twice: as an incidence plot and as a log-scaled
# heatmap of its clique expansion.

# Marker styling passed through to the incidence plot.
node_params = {
    's' : 1,
    'ec' : 'k',
    'lw' : 1,
    'marker' : ".",
    'zorder' : 2,
}

# Line styling passed through to the incidence plot.
line_params = {
    'lw' : 0.1,
    'alpha' : 0.5,
    'zorder' : 1,
}

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 4

plt2.plot_incidence(ut.sort_by_lowest_index(core), 
                    node_color='k',
                    node_params=node_params,
                    line_params=line_params)

plt.title("The Core: Chromosome 1 (Maximizing Fiedler Value)")
plt.show()

# NOTE(review): fill_missing_bins presumably pads the core with the bins of
# df.index that it lacks (going by the helper's name) -- confirm.
core_expanded = ut.fill_missing_bins(core, df.index)
Acore = matrix.clique_expand_incidence(core_expanded, zero_diag=False) 
# Sort both axes by label.
Acore = Acore.sort_index(axis=1)
Acore = Acore.sort_index(axis=0)

# Report the matrix built in this cell. `A` (the Hi-C matrix) is only defined
# in a later cell, so printing it here fails on a fresh kernel.
print(f"{Acore.shape=}")

plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 5, 5

# log1p keeps zero entries at zero while compressing large counts.
sns.heatmap(np.log1p(Acore), 
            cmap='plasma',
            square=True, 
            cbar_kws={'shrink' : 0.5, 'label' : 'Contacts (log)'},
           )

plt.yticks([])
plt.xticks([])

plt.title('The Core (Clique-Expanded)')
plt.ylabel("Chromosome 1 Loci (1Mb)")
plt.xlabel("")

In [None]:
# # need to overwrite the core column

# gdf['in_core'] = gdf['bin'].isin(node_idx)

# plot_venn3_from_df(gdf, 
#                    col1='is_tf',
#                    col2='is_expressed', 
#                    col3='in_core',
#                    set_labels=['TF', 'Expressed', 'Core']
#                   )

In [None]:
# Explicit stop so that "Run All" halts at this cell with a readable message.
# NOTE(review): the next cell reads gdf['in_core'], whose assignment is
# commented out in the cell above -- confirm before removing this guard.
raise RuntimeError("Intentional stop: run the cells below manually.")

In [None]:
# Narrow the gene table step by step: genes flagged as in the core, then
# protein-coding ones, then those with strictly positive expression.
core_genes = (
    gdf[gdf['in_core']]
    .loc[lambda g: g['gene_biotype'] == 'protein_coding']
    .loc[lambda g: g['expression'] > 0]
)
print(f"{core_genes.shape=}")

# Column sums of the three annotation flags among the remaining genes.
print(f"{core_genes['is_tf'].sum()=}")
print(f"{core_genes['mESC_panglaoDB_marker'].sum()=}")
print(f"{core_genes['mESC_GO_marker'].sum()=}")

# Highest expression first.
core_genes = core_genes.sort_values(by='expression', ascending=False)
core_genes.head()

In [None]:
n_genes = 10
database = 'celltypes'

# First `n_genes` gene names in core_genes' current row order.
genes = core_genes['gene_name'].iloc[:n_genes].values
print(genes)

# Enrichment query for those genes against the chosen database.
edf = gget.enrichr(genes, database=database)
edf

# compare with Hi-C

In [None]:
# compare against population Hi-C
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/population_hic/chr1_1000000.parquet"

hic = pd.read_parquet(fpath)
A = hic.to_numpy()
print(f"{A.shape=}")

# drop the telomere
# (removes the first three rows and the first three columns)
A = A[3:, 3:]
print(f"{A.shape=}")

Anorm = matrix.normalize_oe(matrix.normalize_kr(A).todense())

# correct outliers
top = 10
row_idx, col_idx = matrix.get_sorted_upper_triangle_indices(Anorm)

# update the matrix
# Overwrite the first `top` index pairs with the current matrix mean, and
# write the mirrored entry too so Anorm stays symmetric. Without the mirror,
# a later np.linalg.eigh(Anorm) -- which reads only the lower triangle by
# default -- would not see the correction at all.
# NOTE(review): assumes the helper returns (row, col) pairs with row <= col,
# ordered so that the first entries are the outliers -- confirm.
for i in range(min(top, len(row_idx))):
    i_row, i_col = row_idx[i], col_idx[i]
    fill_value = Anorm.mean()
    Anorm[i_row, i_col] = fill_value
    Anorm[i_col, i_row] = fill_value

print(f"{Anorm.shape=}")

# Shows the raw (un-normalised) contacts on a log scale.
plt.imshow(np.log1p(A))

In [None]:
# Side-by-side heatmaps: the clique-expanded core (left) and the Hi-C matrix
# restricted to the core's non-zero pattern (right).

# Figure defaults must be set before the figure is created; rcParams set
# after plt.subplots() do not affect the figure already made.
plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 10, 5

mask = Acore != 0

# Apply the mask
# Keep Hi-C values where the core has a non-zero entry, 0 elsewhere.
masked_arr = np.where(mask, A, 0)

fig, axs = plt.subplots(1, 2)

# Both panels share the same styling; only the data and the title differ.
panels = [(Acore, 'The Core'), (masked_arr, 'Hi-C')]
for panel_ax, (panel_data, panel_title) in zip(axs, panels):
    sns.heatmap(np.log1p(panel_data), 
                cmap='plasma',
                square=True, 
                cbar_kws={'shrink' : 0.5, 'label' : 'Contacts (log)'},
                ax=panel_ax
               )

    panel_ax.set_ylabel("")
    panel_ax.set_xlabel("")
    panel_ax.set_yticks([])
    panel_ax.set_xticks([])
    panel_ax.set_title(panel_title)

plt.tight_layout()

In [None]:
# what if we chose a similar threshold from the Hi-C data?

# eigh returns eigenvalues in ascending order, so the last column is the
# eigenvector belonging to the largest eigenvalue.
eigenvalues, eigenvector = np.linalg.eigh(Anorm)
leading_vector = eigenvector[:, -1]
pc1 = np.ravel(ut.min_max(leading_vector))

# Pearson correlation between the rescaled leading eigenvector and the
# 'log-exp' node centrality.
node_scores = nodes['log-exp']
r, p = scipy.stats.pearsonr(pc1, node_scores.values)
print(f"Correlation: {r=:.3f} ({p=:.5f})")

# Overlay both profiles on the same axes.
plt.plot(pc1)
plt.plot(node_scores)

In [None]:
# choose the nodes based on the same criteria (Fiedler value maximization)
# NOTE(review): in the visible part of this cell `L` is computed but never
# used and nothing is appended to `res`, so the loop looks unfinished.
# Also note that `res` and `node_idx` overwrite the values set by the
# hypergraph sweep cells earlier in the notebook.

res = []

# Same threshold grid as the hypergraph sweep: n_bins values in [0, 1].
for t in np.linspace(0, 1, n_bins):
    # Positions whose rescaled PC1 value exceeds the threshold.
    node_idx = np.ravel(np.argwhere(pc1 > t))
    # Skip selections of 3 or fewer loci.
    if len(node_idx) > 3:
        # Square sub-matrix of the normalised Hi-C restricted to those loci.
        core_hic = Anorm[node_idx, :][:, node_idx]
        
        # Normalised graph Laplacian of the sub-matrix.
        L = scipy.sparse.csgraph.laplacian(core_hic, normed=True)
    

### 