In [1]:
import os
import sys
import pandas as pd
import numpy as np
import glob
import time
import gget
import scipy
import matplotlib.patches as patches
from scipy.sparse import csr_matrix
import anndata as an
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy.stats import fisher_exact
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.patches as mpatches
import matplotlib.cm as cm
from pycirclize import Circos
from scipy.interpolate import splprep, splev
import networkx as nx
import random
from importlib import reload
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import warnings
from itertools import combinations
import ot
from scipy.spatial.distance import pdist, squareform
from matplotlib.colors import ListedColormap

import surprise as sup

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler

"""WARNING: no warnings"""
warnings.filterwarnings("ignore")

# local imports
import anndata_utils as anntools

source_path = os.path.abspath("../source/")
sys.path.append(source_path)
import centrality as central
import matrix
import utils as ut
import plotting as plt2

# Load marker genes

In [2]:
def load_pathway(fpath):
    """
    Loads an Enrichr-like database file into a boolean DataFrame.

    Every line is split on tabs and empty fields are discarded. The first
    remaining field is used as the pathway label and all following fields
    are treated as gene names. Blank or whitespace-only lines are skipped
    instead of raising an IndexError.

    Args:
        fpath (str): Path to the Enrichr-like database file.

    Returns:
        pandas.DataFrame: A boolean DataFrame where:
            - Index: Genes
            - Columns: Pathways
            - Values: True if the gene is in the pathway, False otherwise.
        An empty DataFrame is returned when the file holds no records.
    """

    records = []
    with open(fpath, encoding='utf-8') as f:
        for line in f:
            # Drop empty fields (e.g. an empty description column).
            fields = [x for x in line.strip().split('\t') if x]
            if not fields:
                # Nothing usable on this line; there is no label to index.
                continue

            row = {'label': fields[0]}
            for gene in fields[1:]:
                row[gene] = 1

            records.append(row)

    if not records:
        # Without any record there is no 'label' column to set as index.
        return pd.DataFrame()

    df = pd.DataFrame(records)
    # Missing gene/pathway pairs become 0 -> False; transpose to genes x pathways.
    df = df.fillna(0.0).set_index('label').astype(bool).T

    return df

fpath = "../../ONT-single-cell/resources/PanglaoDB_Augmented_2021.txt"
pdf = load_pathway(fpath)

# Genes flagged True in the 'Embryonic Stem Cells' column of the loaded table.
stem_genes = list(pdf[pdf['Embryonic Stem Cells']].index)

# Normalise the symbol casing to "first letter upper, rest lower".
# str.capitalize() is used instead of str.title() because title() also
# upper-cases every letter that follows a digit or punctuation mark
# (e.g. 'POU5F1'.title() == 'Pou5F1'), and such a name can never match a
# gene_name cased like 'Pou5f1' in a later isin(stem_genes) lookup.
# NOTE(review): this is still a casing heuristic, not an ortholog mapping --
# confirm it matches the naming used by the gene tables loaded below.
stem_genes = [x.capitalize() for x in stem_genes]
stem_genes[:10]

['Gjb1',
 'Amotl2',
 'Yap1',
 'Fbln1',
 'Uaca',
 'Antxr1',
 'Fermt2',
 'Serpinh1',
 'Pls3',
 'Gpx8']

# Load scores and expression

In [3]:
# --- LOAD centrality ---
resolution = 1000000
fpath = f"/scratch/indikar_root/indikar1/shared_data/higher_order/global_core_score/population_mESC_{resolution}_scores.csv"
scores = pd.read_csv(fpath)
print(f"{scores.shape=}")

# --- CREATE the bin map for global sorting ---
# Maps each 'bin_name' to its 'bin' value from the scores table.
bin_map = dict(zip(scores['bin_name'].values, scores['bin'].values))

# --- LOAD expression ---
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/expression_table/rna_table.parquet"
tdf = pd.read_parquet(fpath)
# No filtering is applied to the expression table in this cell, so the shape
# is reported once (a separate "(filtered)" line would repeat the same value).
print(f"{tdf.shape=}")

# gene_name -> TPM lookup. If a gene_name occurs on several rows, the value
# from the last such row is the one kept (dict construction semantics).
expression_map = dict(zip(tdf['gene_name'].values, tdf['TPM'].values))

tdf.sample(5)

scores.shape=(2431, 36)
(raw) tdf.shape=(51883, 19)
(filtered) tdf.shape=(51883, 19)


Unnamed: 0,gene_id,transcript_id(s),length,effective_length,expected_count,TPM,FPKM,posterior_mean_count,posterior_standard_deviation_of_count,pme_TPM,pme_FPKM,TPM_ci_lower_bound,TPM_ci_upper_bound,TPM_coefficient_of_quartile_variation,FPKM_ci_lower_bound,FPKM_ci_upper_bound,FPKM_coefficient_of_quartile_variation,ens_gene_id,gene_name
6280,ENSMUSG00000027746.13,"ENSMUST00000029309.12,ENSMUST00000122330.1,ENS...",996.08,745.23,619.0,61.51,45.88,618.98,0.15,60.61,47.39,48.7669,72.542,0.067113,38.2721,56.8637,0.067058,ENSMUSG00000027746,Ufm1
31448,ENSMUSG00000094342.2,"ENSMUST00000178144.1,ENSMUST00000186631.1",336.0,123.17,0.0,0.0,0.0,0.0,0.0,0.28,0.22,3e-06,0.841215,0.655416,3e-06,0.657775,0.655401,ENSMUSG00000094342,Gm17522
20482,ENSMUSG00000074283.5,"ENSMUST00000037448.6,ENSMUST00000206362.1,ENSM...",2378.65,2126.89,81.0,2.82,2.1,81.0,0.0,2.75,2.15,2.11016,3.41601,0.081786,1.64535,2.66746,0.08173,ENSMUSG00000074283,Zfp109
26865,ENSMUSG00000085886.1,ENSMUST00000148318.1,2536.0,2284.24,4.0,0.13,0.1,4.0,0.0,0.15,0.12,0.038109,0.291886,0.302365,0.029778,0.228216,0.302576,ENSMUSG00000085886,D030047H15Rik
29988,ENSMUSG00000091277.2,ENSMUST00000169406.2,1184.0,932.24,6.81,0.54,0.4,7.54,2.39,0.64,0.5,0.166885,1.21909,0.298526,0.130474,0.953441,0.298568,ENSMUSG00000091277,Gm1818


# Load hyperedges from population

In [4]:
# --- LOAD the TF list ---
# One entry per line; surrounding whitespace is stripped. The file is opened
# in a context manager so the handle is closed once the list is built.
fpath = "/nfs/turbo/umms-indikar/shared/projects/twin_cell/data/b_matrix/SCENIC/scenic/tf_lists/allTFs_mm.txt"
with open(fpath) as tf_file:
    tf_list = [x.strip() for x in tf_file]

# --- LOAD the genes ---
fpath = f"/scratch/indikar_root/indikar1/shared_data/higher_order/lightweight/population_mESC_{resolution}_gdf.parquet"
df = pd.read_parquet(fpath)
df['is_tf'] = df['gene_name'].isin(tf_list)
df['is_marker'] = df['gene_name'].isin(stem_genes)

# --- MERGE the core scores ---
merge_columns = [
    'bin_name',
    'ATACSeq_1',
    'CTCF',
    'H3K27ac',
    'H3K27me3',
    'RNA_5',
    'global_hge_logexp_RNA_weighted'
]

# The join key is spelled out so the merge cannot silently pick up extra
# shared column names; 'bin_name' is the only key column in merge_columns.
df = pd.merge(
    df, scores[merge_columns].fillna(0.0), how="left", on='bin_name',
)

# --- DEFINE the core ---
# The threshold is the requested quantile of the score taken over all rows of
# df (missing scores counted as 0.0); rows strictly above it are flagged core.
score_column = 'global_hge_logexp_RNA_weighted'
core_threshold_quantile = 0.75
threshold = np.quantile(df[score_column].fillna(0.0), core_threshold_quantile)
df['is_core'] = df[score_column].fillna(0.0) > threshold
df = df.rename(columns={
    score_column : 'core_score',
})

print(df['is_core'].value_counts())

# --- ADD some convenience columns ---
df['chrom'] = df['bin_name'].str.split(":").str[0]   # text before the first ':'
df['bin'] = df['bin_name'].map(bin_map)
df['expression'] = df['gene_name'].map(expression_map)
# order: number of distinct genes per read; degree: number of distinct reads per gene
df['order'] = df.groupby('read_name')['gene_name'].transform('nunique')
df['degree'] = df.groupby('gene_name')['read_name'].transform('nunique')
df['is_pt'] = (df['gene_biotype'] == 'protein_coding')

print(f"{df.shape=}")
df.head()

is_core
False    3704772
True     1234495
Name: count, dtype: int64
df.shape=(4939267, 19)


Unnamed: 0,gene_name,gene_biotype,read_name,bin_name,is_tf,is_marker,ATACSeq_1,CTCF,H3K27ac,H3K27me3,RNA_5,core_score,is_core,chrom,bin,expression,order,degree,is_pt
0,Ulk4,protein_coding,3891ee6d-53d1-4ee0-ba2f-3d22291d4493,chr9:121,False,False,0.826484,1.149226,1.349552,0.866066,0.573875,0.254849,False,chr9,1394.0,4.12,2,1236,True
1,Smarca2,protein_coding,66953ddf-e76d-4cdf-aaf8-be028a2d7b04,chr19:26,False,False,0.497386,0.547185,0.336787,0.839273,0.141617,0.157878,False,chr19,2436.0,4.55,11,609,True
2,Dlgap3,protein_coding,ad5b2240-893f-4ed0-a157-c2be66d8d754,chr4:127,False,False,0.754788,1.027046,1.577616,0.839461,0.906364,0.491722,True,chr4,665.0,87.94,5,262,True
3,Gm56531,lncRNA,3f354c45-5e48-4f6d-8c7e-05369432b344,chr12:8,False,False,0.690311,0.851758,0.774613,0.833768,0.617768,0.310082,False,chr12,1659.0,,1,155,False
4,Twsg1,protein_coding,d4626feb-16a2-4aac-8145-53e89b60bf7c,chr17:66,False,False,0.767515,0.921479,1.337415,0.571748,0.508463,0.28903,False,chr17,2289.0,23.16,1,140,True


# Save the reads

In [5]:
# Destination of the full read-level table for this resolution.
outpath = f"/scratch/indikar_root/indikar1/shared_data/higher_order/transcription_clusters/read_table_{resolution}.parquet"

# Wall-clock timing around the parquet write.
start_time = time.time()
df.to_parquet(outpath, index=False)
end_time = time.time()
total_time = end_time - start_time

print(f"DataFrame saved to: {outpath}")
print(f"Total time taken: {total_time:.2f} seconds")

print("\nFirst 5 rows of the DataFrame:")
df.head(5)

DataFrame saved to: /scratch/indikar_root/indikar1/shared_data/higher_order/transcription_clusters/read_table_1000000.parquet
Total time taken: 5.55 seconds

First 5 rows of the DataFrame:


Unnamed: 0,gene_name,gene_biotype,read_name,bin_name,is_tf,is_marker,ATACSeq_1,CTCF,H3K27ac,H3K27me3,RNA_5,core_score,is_core,chrom,bin,expression,order,degree,is_pt
0,Ulk4,protein_coding,3891ee6d-53d1-4ee0-ba2f-3d22291d4493,chr9:121,False,False,0.826484,1.149226,1.349552,0.866066,0.573875,0.254849,False,chr9,1394.0,4.12,2,1236,True
1,Smarca2,protein_coding,66953ddf-e76d-4cdf-aaf8-be028a2d7b04,chr19:26,False,False,0.497386,0.547185,0.336787,0.839273,0.141617,0.157878,False,chr19,2436.0,4.55,11,609,True
2,Dlgap3,protein_coding,ad5b2240-893f-4ed0-a157-c2be66d8d754,chr4:127,False,False,0.754788,1.027046,1.577616,0.839461,0.906364,0.491722,True,chr4,665.0,87.94,5,262,True
3,Gm56531,lncRNA,3f354c45-5e48-4f6d-8c7e-05369432b344,chr12:8,False,False,0.690311,0.851758,0.774613,0.833768,0.617768,0.310082,False,chr12,1659.0,,1,155,False
4,Twsg1,protein_coding,d4626feb-16a2-4aac-8145-53e89b60bf7c,chr17:66,False,False,0.767515,0.921479,1.337415,0.571748,0.508463,0.28903,False,chr17,2289.0,23.16,1,140,True


# Make gene and read references

In [6]:
# Attributes kept for the node table; one output row per distinct
# combination of these values in df.
columns = [
    'gene_name', 'gene_biotype',
    'bin_name', 'chrom', 'bin',
    'is_pt', 'is_tf', 'is_marker', 'is_core',
    'core_score',
    'ATACSeq_1', 'CTCF', 'H3K27ac', 'H3K27me3', 'RNA_5',
    'expression',
    'degree',
]

# Column selection already yields a new frame; de-duplicate it directly.
nodes = df.loc[:, columns].drop_duplicates()
print(f"{nodes.shape=}")

outpath = f"/scratch/indikar_root/indikar1/shared_data/higher_order/transcription_clusters/nodes_{resolution}.parquet"
nodes.to_parquet(outpath, index=False)
nodes.head(5)

nodes.shape=(48681, 17)


Unnamed: 0,gene_name,gene_biotype,bin_name,chrom,bin,is_pt,is_tf,is_marker,is_core,core_score,ATACSeq_1,CTCF,H3K27ac,H3K27me3,RNA_5,expression,degree
0,Ulk4,protein_coding,chr9:121,chr9,1394.0,True,False,False,False,0.254849,0.826484,1.149226,1.349552,0.866066,0.573875,4.12,1236
1,Smarca2,protein_coding,chr19:26,chr19,2436.0,True,False,False,False,0.157878,0.497386,0.547185,0.336787,0.839273,0.141617,4.55,609
2,Dlgap3,protein_coding,chr4:127,chr4,665.0,True,False,False,True,0.491722,0.754788,1.027046,1.577616,0.839461,0.906364,87.94,262
3,Gm56531,lncRNA,chr12:8,chr12,1659.0,False,False,False,False,0.310082,0.690311,0.851758,0.774613,0.833768,0.617768,,155
4,Twsg1,protein_coding,chr17:66,chr17,2289.0,True,False,False,False,0.28903,0.767515,0.921479,1.337415,0.571748,0.508463,23.16,140


In [7]:
# Per-read summaries, written as output column -> (source column, aggregation).
edge_summaries = {
    'order': ('gene_name', 'nunique'),          # distinct genes on the read
    'n_chroms': ('chrom', 'nunique'),           # distinct chromosomes on the read
    'n_tf': ('is_tf', 'sum'),                   # rows flagged as TF
    'n_pt': ('is_pt', 'sum'),                   # rows flagged as protein coding
    'mean_expression': ('expression', 'mean'),
}

grouped_reads = df.groupby('read_name')
edges = grouped_reads.agg(**edge_summaries).reset_index()

print(f"{edges.shape=}")

outpath = f"/scratch/indikar_root/indikar1/shared_data/higher_order/transcription_clusters/edges_{resolution}.parquet"
edges.to_parquet(outpath, index=False)
edges.head(5)

edges.shape=(2200583, 6)


Unnamed: 0,read_name,order,n_chroms,n_tf,n_pt,mean_expression
0,00000202-49cf-47b2-83bf-5eb3f6d98373,1,1,0,1,10.17
1,00000b61-7794-4b29-9f89-2b74e7bbce3e,3,3,0,3,61.916667
2,00001034-f0ab-41c2-885a-da17c2b836ae,2,2,0,2,0.295
3,0000131a-4f27-4dc5-839d-09720b024db9,1,1,0,1,12.84
4,00001afb-6b62-4d01-be02-9509485330ab,1,1,0,1,0.04


# Make and store the incidence matrix

In [8]:
def make_incidence(df):
    """
    Builds a binary gene-by-read incidence matrix as a sparse DataFrame.

    Rows are the distinct values of ``df['gene_name']`` and columns are the
    distinct values of ``df['read_name']``, both in order of first
    appearance. A cell is 1 when the (gene, read) pair occurs in ``df`` at
    least once, otherwise 0.

    Positions and labels are both taken from ``pd.factorize`` so that they
    share the same ordering. Category codes follow sorted category order
    while ``Series.unique()`` follows first-appearance order, so mixing the
    two attaches labels to the wrong rows/columns on unsorted input.

    The input frame is not modified. Rows where either name is missing are
    ignored.

    Args:
        df (pandas.DataFrame): Table with 'gene_name' and 'read_name' columns.

    Returns:
        pandas.DataFrame: Sparse-backed DataFrame of 0/1 integers, indexed by
            gene name with one column per read name.
    """
    # Work on a projection so no helper columns are added to the caller's frame.
    pairs = df[['gene_name', 'read_name']].dropna()

    # factorize returns integer codes and the matching labels in one order.
    gene_codes, genes = pd.factorize(pairs['gene_name'])
    read_codes, reads = pd.factorize(pairs['read_name'])

    X = csr_matrix(
        (np.ones(len(pairs), dtype=int), (gene_codes, read_codes)),
        shape=(len(genes), len(reads)),
    )
    # Repeated (gene, read) pairs are summed on construction; clamp back to 0/1.
    X = csr_matrix((X > 0).astype(int))

    return pd.DataFrame.sparse.from_spmatrix(
        X, index=genes, columns=reads,
    )

# --- FILTER the hyperedges ---
order_threshold = 2.0

# Keep protein-coding rows only, then recount distinct genes per read on the
# reduced table before applying the order cut-off.
H = df.loc[df['is_pt']].reset_index(drop=True)
H['order'] = H.groupby('read_name')['gene_name'].transform('nunique')
keep_rows = H['order'] >= order_threshold
H = H.loc[keep_rows].reset_index(drop=True)

# --- STRUCTURE the incidence matrix ---
H = make_incidence(H)
print(f"{H.shape=}")
print(f"{type(H)=}")
size_mb = H.memory_usage().sum() / 1024**2
print(f"DataFrame size: {size_mb:.2f} MB")

# NOTE(review): the file name spells "protien"; left untouched because other
# code may read this exact path.
outpath = f"/scratch/indikar_root/indikar1/shared_data/higher_order/transcription_clusters/protien_coding_incidence_{resolution}.pkl"
H.to_pickle(outpath)
H.head(5)

H.shape=(21278, 1089002)
type(H)=<class 'pandas.core.frame.DataFrame'>
DataFrame size: 35.84 MB


Unnamed: 0,3891ee6d-53d1-4ee0-ba2f-3d22291d4493,66953ddf-e76d-4cdf-aaf8-be028a2d7b04,ad5b2240-893f-4ed0-a157-c2be66d8d754,3b213af3-2882-4f23-887e-ecfc864e0f17,5f9cdb29-7ddc-4d9a-93ee-e82edb44a08d,52a1f5d7-c3fc-4cce-bab6-06f8cda9f4bd,d7dbae9b-5deb-4a0f-8f97-eb5101d949cd,4d19c76d-2583-4308-b495-77f34035ac87,e6f27aaf-9cfa-4bf0-b3e4-c81bc946d811,7b85fb37-32dc-4a77-ad2c-26b8c6e57ec3,...,6f6855ef-e494-4a8a-bd4e-7bb05c916950,401c8d32-d3bb-4c71-816e-3581a27f3b9e,0c1c9019-79e9-4d78-84f8-ab977c1b976a,ea773a8a-e760-4bc3-9f3e-bb42e980b45c,3a1950eb-15c2-4b46-b1f4-cf988afc9af8,29884a9e-228d-456a-9869-c663e2b4cb86,dfbc441f-bff3-4972-8ac6-fa69454347a3,a89b8cf7-8579-4bd3-ad1f-a6e4295ff8cc,e13378e4-38f6-4bac-92e8-d849bbd1c764,ea4352dd-04b6-46ae-aae3-0f1af9ec1b1a
Ulk4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Smarca2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dlgap3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Tmem267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Atg4a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Define and save the core

In [9]:
tf = 1    # minimum number of TF rows on a read
d = 2     # minimum number of genes on a read (applied to core and expressed counts)
t = 2.0   # expression threshold: a gene counts as expressed when expression > t

# Criteria:
#   1. one TF
#   2. d protein-coding genes in the core with expression over t
#   3. multiple chromosomes
#
# NOTE(review): the core count and the expressed count are each compared to
# `d` independently; they are not required to be the same genes.
# NOTE(review): criterion 3 is listed but no filter on `chroms` is applied
# below, so single-chromosome reads are kept -- confirm which is intended.

core = df.copy()
print(f"(raw) {core.shape=}")

# Only protein-coding rows with a known expression value take part.
core = core[core['expression'].notna() & core['is_pt']].copy()
core['is_expressed'] = core['expression'] > t

core = core.groupby('read_name').agg(
    chroms = ('chrom', 'nunique'),
    tf = ('is_tf', 'sum'),
    core = ('is_core', 'sum'),
    expressed = ('is_expressed', 'sum'),
    mean_expression = ('expression', 'mean'),
).reset_index()

print(f"(grouped) {core.shape=}")

core = core[core['tf'] >= tf]
core = core[core['core'] >= d]
# The number of expressed genes is a count, so it is compared with the count
# threshold `d`, not with the expression threshold `t` (they only coincide
# while d == 2 and t == 2.0).
core = core[core['expressed'] >= d]

print(f"(filtered) {core.shape=}")


outpath = f"/scratch/indikar_root/indikar1/shared_data/higher_order/transcription_clusters/core_edges_{resolution}.parquet"
core.to_parquet(outpath, index=False)
core.head()

(raw) core.shape=(4939267, 19)
(grouped) core.shape=(2021658, 6)
(filtered) core.shape=(38582, 6)


Unnamed: 0,read_name,chroms,tf,core,expressed,mean_expression
28,0000c863-d9dd-4589-8307-614a9f5db5b1,4,1,2,2,7.832
90,0003060b-d552-48f5-a116-6219ff00b425,4,1,2,4,3.206
166,00055dd1-0900-4c4c-986f-2d31b8093830,1,1,2,2,6.9375
180,00059de3-2ef6-47be-b133-27b1dbb5d212,4,3,2,4,87.314
217,0006c00f-8c61-4d94-946b-8b056e4a292c,3,1,3,3,16.566


In [10]:
# Rows of df that belong to a read retained in the core edge table.
in_core_read = df['read_name'].isin(core['read_name'].values)
H = df.loc[in_core_read].copy()

# --- STRUCTURE the incidence matrix ---
H = make_incidence(H)
print(f"{H.shape=}")
print(f"{type(H)=}")
size_mb = H.memory_usage().sum() / 1024**2
print(f"DataFrame size: {size_mb:.2f} MB")

outpath = f"/scratch/indikar_root/indikar1/shared_data/higher_order/transcription_clusters/core_incidence_{resolution}.pkl"
H.to_pickle(outpath)
H.head(5)

H.shape=(24415, 38582)
type(H)=<class 'pandas.core.frame.DataFrame'>
DataFrame size: 2.56 MB


Unnamed: 0,66953ddf-e76d-4cdf-aaf8-be028a2d7b04,c160a170-5af7-412b-9c03-36dfef017384,a8241f5d-1189-49c9-a948-c6a1c91cdf17,3b0686b6-f18f-495d-89f5-8c8b286c2bb1,77bc1796-a0a3-4140-a97b-d4a786d17cb2,4f913e8a-799a-488e-a7bd-6ae9566e5c37,da8a0dfa-5deb-48c3-bf6e-bde5534e0578,a425bdc9-37ea-4020-bc7c-5085fb99a3c7,8ad994fd-c214-46f9-99b0-37c2b3f2946e,b11c9d23-365a-44f5-ace6-217680ce31e8,...,6e74d129-3498-42b0-ade9-0d35f60543af,eae8359f-2057-4492-93d1-10437e892f0b,01f1812f-b821-493c-bc17-3e5e6d453f2c,fcf0a060-2833-4ff0-a352-d5e702f27f46,6199d009-7ef7-44f6-b10c-c0ac846f362c,06857eb1-74b5-484c-9eb3-b0ced3498887,689b7cde-4f15-4869-bda9-bb49cf4e5676,9f9a0a4e-630b-406a-bb24-7026948c9787,acfb51fd-375f-42ac-839c-326616ea1953,de0a50ba-dade-46ba-9ed2-583b92b669b2
Smarca2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sh2d5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ppp1r15b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dyrk1a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Igf2bp3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
order_threshold = 3.0

# Protein-coding rows of core reads only; the per-read gene count is then
# recomputed on that subset before the order cut-off is applied.
in_core_read = df['read_name'].isin(core['read_name'].values)
H = df.loc[in_core_read & df['is_pt']].reset_index(drop=True)
H['order'] = H.groupby('read_name')['gene_name'].transform('nunique')
keep_rows = H['order'] >= order_threshold
H = H.loc[keep_rows].reset_index(drop=True)


# --- STRUCTURE the incidence matrix ---
H = make_incidence(H)
print(f"{H.shape=}")
print(f"{type(H)=}")
size_mb = H.memory_usage().sum() / 1024**2
print(f"DataFrame size: {size_mb:.2f} MB")

# NOTE(review): the file name spells "protien"; left untouched because other
# code may read this exact path.
outpath = f"/scratch/indikar_root/indikar1/shared_data/higher_order/transcription_clusters/core_incidence_{resolution}_protien_coding_only.pkl"
H.to_pickle(outpath)
H.head(5)

H.shape=(17186, 34592)
type(H)=<class 'pandas.core.frame.DataFrame'>
DataFrame size: 2.17 MB


Unnamed: 0,66953ddf-e76d-4cdf-aaf8-be028a2d7b04,c160a170-5af7-412b-9c03-36dfef017384,3b0686b6-f18f-495d-89f5-8c8b286c2bb1,77bc1796-a0a3-4140-a97b-d4a786d17cb2,4f913e8a-799a-488e-a7bd-6ae9566e5c37,da8a0dfa-5deb-48c3-bf6e-bde5534e0578,a425bdc9-37ea-4020-bc7c-5085fb99a3c7,8ad994fd-c214-46f9-99b0-37c2b3f2946e,2ea6e55f-cc78-418a-b241-f134009153a0,05790af8-be74-4b99-8d1b-49074fa8f81d,...,05fc8d13-3610-4bb0-b173-a908dd526cdd,925b2134-befc-44e1-a9c5-97ca1295c96c,2f5a483f-7f31-4028-b91e-8f8c83a5b922,43cef5b9-05dc-4d4a-b20b-349a91ae2224,49c8ea45-81cb-4f76-a7ad-bd753e8f8c7c,eae8359f-2057-4492-93d1-10437e892f0b,fcf0a060-2833-4ff0-a352-d5e702f27f46,6199d009-7ef7-44f6-b10c-c0ac846f362c,0e705646-2f32-40f9-bad9-fbf5f7ef9d79,9f9a0a4e-630b-406a-bb24-7026948c9787
Smarca2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sh2d5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dyrk1a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Igf2bp3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Tmem267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
