## Running the full workflow on scimilarity embeddings

In [1]:
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr, f_oneway
from statsmodels.stats.multitest import multipletests
import cellxgene_census
import scanpy as sc
import pandas as pd
import numpy as np
from scipy.sparse import issparse

# GSEAPY and decoupler (DoRothEA / pySCENIC)
import gseapy as gp
from decoupler.op import dorothea
from decoupler.mt import viper

# 1. Choose a Census version and organism
ORGANISM = "homo_sapiens"
MEASUREMENT = "RNA"
CENSUS_VERSION = "2025-01-30"

SAMPLE_SIZE = 5000

EMBEDDING_NAME = "geneformer"

METADATA_FIELDS = [
    "assay",
    "dataset_id",
    "cell_type",
    "development_stage",
    "disease",
    "self_reported_ethnicity",
    "sex",
    "tissue_general",
    "tissue",
    "soma_joinid"  # Need this for joining with expression data
]

DATA_PATH = "../data/external/embeddings_scimilarity.h5ad"
INDICES = np.load("../data/external/test_indices.npy", allow_pickle=True)

  from .autonotebook import tqdm as notebook_tqdm


### Generate annotations

In [4]:
# load anndata
adata = sc.read_h5ad(DATA_PATH)

In [5]:
# subset adata to observations at INDICES
adata = adata[INDICES]
adata

View of AnnData object with n_obs × n_vars = 50000 × 28231
    obs: 'assay', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'tissue', 'tissue_general', 'n_measured_vars'
    uns: 'orig_genes'
    obsm: 'X_emb'
    layers: 'counts'

In [7]:
# add metadata
annotations_df = pd.DataFrame(index=adata.obs_names)

for field in METADATA_FIELDS:
    if field in adata.obs.columns:
        annotations_df[field] = adata.obs[field]

In [8]:
# technical metrics
if issparse(adata.X):
    X = adata.X.toarray()
else:
    X = adata.X

annotations_df['n_counts'] = X.sum(axis=1)
annotations_df['n_genes'] = (X > 0).sum(axis=1)

mito_genes = adata.var_names.str.upper().str.startswith("MT-")
annotations_df['pct_mito'] = X[:, mito_genes].sum(axis=1) / annotations_df['n_counts'] * 100

ribo_genes = adata.var_names.str.startswith(("RPS","RPL"))
annotations_df['pct_ribo'] = X[:, ribo_genes].sum(axis=1) / annotations_df['n_counts'] * 100

In [9]:
hallmark_genesets = gp.get_library(name='MSigDB_Hallmark_2020', organism='Human')

2025-11-05 13:41:26 | [INFO] Downloading and generating Enrichr library gene sets...
2025-11-05 13:41:26 | [INFO] Library is already downloaded in: /home/amoneim/.cache/gseapy/Enrichr.MSigDB_Hallmark_2020.gmt, use local file


In [10]:
# cell cycle
# S-phase → E2F targets, G2/M → G2M checkpoint
s_genes = [g for g in hallmark_genesets['E2F Targets'] if g in adata.var_names]
g2m_genes = [g for g in hallmark_genesets['G2-M Checkpoint'] if g in adata.var_names]

if len(s_genes) == 0 or len(g2m_genes) == 0:
    raise ValueError("No S/G2M genes from Hallmark found in adata.var_names.")

sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes, copy=False)
annotations_df['S_score'] = adata.obs['S_score']
annotations_df['G2M_score'] = adata.obs['G2M_score']
annotations_df['phase'] = adata.obs['phase']

  adata.obs[score_name] = pd.Series(


In [None]:
# additional biological programs via ssGSEA
marker_sets = {
    "Inflammation Response": hallmark_genesets['Inflammatory Response'],
    "Hypoxia": hallmark_genesets['Hypoxia']
}

expr_df = pd.DataFrame(X.T, index=adata.var_names, columns=adata.obs_names)

for name, genes in marker_sets.items():
    print(f"Calculating score for {name}...")
    # This is extremely fast and works on sparse data
    sc.tl.score_genes(
        adata,
        gene_list=genes,
        score_name=name,
        use_raw=False
    )

    annotations_df[name] = adata.obs[name]

print("All gene scores calculated and added.")

Calculating score for Inflammation Response...
Calculating score for Hypoxia...
All gene scores calculated and added.


In [12]:
annotations_df

Unnamed: 0,assay,cell_type,development_stage,tissue_general,tissue,n_counts,n_genes,pct_mito,pct_ribo,S_score,G2M_score,phase,Inflammation Response,Hypoxia
125421,10x 3' v3,endothelial cell of lymphatic vessel,sixth decade stage,lung,lung parenchyma,4372.451047,4677,0.647219,3.918741,-0.064049,0.057220,G2M,-0.016009,0.176030
348609,10x 3' v3,hepatic stellate cell,unknown,liver,liver,4399.495421,5029,0.000000,0.000000,0.105877,0.170911,G2M,-0.008083,0.198231
193834,10x 3' v2,early lymphoid progenitor,15th week post-fertilization stage,bone marrow,bone marrow,2785.743149,2682,0.937140,10.999956,0.258803,0.190675,S,-0.051502,0.059599
103147,10x 3' v3,corticothalamic-projecting glutamatergic corti...,adult stage,brain,neocortex,4191.821643,5889,0.178932,0.289241,-0.110893,-0.003373,G1,-0.083046,-0.034153
285859,10x 5' v2,transit amplifying cell,seventh decade stage,large intestine,rectum,1997.553873,1174,3.343266,9.574379,-0.001206,0.047865,G2M,-0.001143,0.082314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30449,10x 3' v3,paneth cell,26-year-old stage,intestine,intestinal mucosa,3296.109654,4175,1.374060,8.601619,0.079751,0.124962,G2M,-0.053266,0.033455
26077,10x 3' v2,transit amplifying cell of colon,adult stage,colon,colonic epithelium,2617.344324,2617,1.958384,9.318971,-0.010553,-0.041089,G1,-0.055169,0.161565
66260,10x 3' v3,endocardial cell,10th week post-fertilization stage,heart,basal zone of heart,2796.461254,1975,0.868666,10.306040,0.079304,0.085201,G2M,-0.052676,0.061323
174626,10x 3' v3,fallopian tube secretory epithelial cell,43-year-old stage,fallopian tube,fimbria of fallopian tube,2757.101555,2651,1.609990,10.808211,-0.026154,0.007163,G2M,0.006595,0.124701


In [13]:
# add to adata.obs
for col in annotations_df.columns:
    adata.obs[col] = annotations_df[col]

In [14]:
adata.obs

Unnamed: 0,assay,cell_type,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,tissue,tissue_general,n_measured_vars,S_score,G2M_score,phase,Inflammation Response,Hypoxia,n_counts,n_genes,pct_mito,pct_ribo
125421,10x 3' v3,endothelial cell of lymphatic vessel,CL:0002138,sixth decade stage,HsapDv:0000240,lung parenchyma,lung,36869,-0.064049,0.057220,G2M,-0.016009,0.176030,4372.451047,4677,0.647219,3.918741
348609,10x 3' v3,hepatic stellate cell,CL:0000632,unknown,unknown,liver,liver,25484,0.105877,0.170911,G2M,-0.008083,0.198231,4399.495421,5029,0.000000,0.000000
193834,10x 3' v2,early lymphoid progenitor,CL:0000936,15th week post-fertilization stage,HsapDv:0000052,bone marrow,bone marrow,32839,0.258803,0.190675,S,-0.051502,0.059599,2785.743149,2682,0.937140,10.999956
103147,10x 3' v3,corticothalamic-projecting glutamatergic corti...,CL:4023013,adult stage,HsapDv:0000258,neocortex,brain,18786,-0.110893,-0.003373,G1,-0.083046,-0.034153,4191.821643,5889,0.178932,0.289241
285859,10x 5' v2,transit amplifying cell,CL:0009010,seventh decade stage,HsapDv:0000241,rectum,large intestine,33145,-0.001206,0.047865,G2M,-0.001143,0.082314,1997.553873,1174,3.343266,9.574379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30449,10x 3' v3,paneth cell,CL:0000510,26-year-old stage,HsapDv:0000120,intestinal mucosa,intestine,33118,0.079751,0.124962,G2M,-0.053266,0.033455,3296.109654,4175,1.374060,8.601619
26077,10x 3' v2,transit amplifying cell of colon,CL:0009011,adult stage,HsapDv:0000258,colonic epithelium,colon,60528,-0.010553,-0.041089,G1,-0.055169,0.161565,2617.344324,2617,1.958384,9.318971
66260,10x 3' v3,endocardial cell,CL:0002350,10th week post-fertilization stage,HsapDv:0000047,basal zone of heart,heart,27411,0.079304,0.085201,G2M,-0.052676,0.061323,2796.461254,1975,0.868666,10.306040
174626,10x 3' v3,fallopian tube secretory epithelial cell,CL:4030006,43-year-old stage,HsapDv:0000137,fimbria of fallopian tube,fallopian tube,60054,-0.026154,0.007163,G2M,0.006595,0.124701,2757.101555,2651,1.609990,10.808211


In [15]:
# save adata
adata.write("../data/external/annotated_test.h5ad")

... storing 'phase' as categorical


### Combine annotations with sparse features

In [17]:
ANNOTATED_ADATA_FILE = "../data/external/annotated_test.h5ad"
SPARSE_FEATURES = "../data/external/embd_sparse.npz"

In [20]:
# load sparse matrix
from scipy import sparse

# Load the sparse matrix
mat = sparse.load_npz(SPARSE_FEATURES)

In [21]:
mat

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 6321654 stored elements and shape (50000, 10280)>

In [22]:
# add mat as an obsm
adata.obsm['sparse_features'] = mat

In [23]:
# save anndata
adata.write("../data/external/annotated_test_with_sparse_v1.h5ad")

### Analyze sparse features

In [24]:
DATA_PATH = "../data/external/annotated_test_with_sparse_v1.h5ad"
adata = sc.read_h5ad(DATA_PATH)

In [25]:
adata

AnnData object with n_obs × n_vars = 50000 × 28231
    obs: 'assay', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'tissue', 'tissue_general', 'n_measured_vars', 'S_score', 'G2M_score', 'phase', 'Inflammation Response', 'Hypoxia', 'n_counts', 'n_genes', 'pct_mito', 'pct_ribo'
    uns: 'orig_genes'
    obsm: 'X_emb', 'sparse_features'
    layers: 'counts'

In [29]:
# drop cell_type_ontology_term_id, development_stage, development_stage_ontology_term_id from obs
adata.obs = adata.obs.drop(columns=[
    "cell_type_ontology_term_id",
    "development_stage",
    "development_stage_ontology_term_id"
])

In [28]:
import numpy as np
import pandas as pd
from scipy import stats
from tqdm import tqdm

# --- Load feature activations ---
X = adata.obsm["sparse_features"]
if not isinstance(X, np.ndarray):
    X = X.toarray()

# --- Optional thresholding (e.g. binarize or zero small activations) ---
X_thr = (X > np.percentile(X, 75, axis=0)).astype(float)  # or choose your own rule

# --- Get annotations ---
obs = adata.obs.copy()

# --- Separate continuous vs categorical variables ---
cont_vars, cat_vars = [], []
for c in obs.columns:
    dtype = obs[c].dtype
    if pd.api.types.is_numeric_dtype(dtype):
        cont_vars.append(c)
    elif pd.api.types.is_categorical_dtype(dtype) or pd.api.types.is_object_dtype(dtype):
        cat_vars.append(c)

print(f"Continuous variables ({len(cont_vars)}): {cont_vars}")
print(f"Categorical variables ({len(cat_vars)}): {cat_vars}")

results = []

# --- Correlate each feature with continuous variables ---
for var in tqdm(cont_vars, desc="Continuous variables"):
    y = obs[var].values
    for i in range(X_thr.shape[1]):
        x = X_thr[:, i]
        if np.std(x) == 0 or np.std(y) == 0:
            corr, p = np.nan, np.nan
        else:
            corr, p = stats.spearmanr(x, y)
        results.append({
            "feature": f"F{i}",
            "variable": var,
            "type": "continuous",
            "correlation": corr,
            "pval": p
        })

# --- Test categorical variables (ANOVA) ---
for var in tqdm(cat_vars, desc="Categorical variables"):
    groups = [X_thr[obs[var] == level, :] for level in obs[var].unique()]
    for i in range(X_thr.shape[1]):
        if any(len(g) < 2 for g in groups):
            F, p = np.nan, np.nan
        else:
            F, p = stats.f_oneway(*[g[:, i] for g in groups])
        results.append({
            "feature": f"F{i}",
            "variable": var,
            "type": "categorical",
            "correlation": F,
            "pval": p
        })

# --- Collect results ---
print("Collecting results...")
corr_df = pd.DataFrame(results)
corr_df["padj"] = stats.false_discovery_control(corr_df["pval"].fillna(1), method="bh")  # optional, requires scipy>=1.13

corr_df.sort_values("pval", inplace=True)
corr_df.head()


Continuous variables (9): ['n_measured_vars', 'S_score', 'G2M_score', 'Inflammation Response', 'Hypoxia', 'n_counts', 'n_genes', 'pct_mito', 'pct_ribo']
Categorical variables (8): ['assay', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'tissue', 'tissue_general', 'phase']


Continuous variables: 100%|██████████| 9/9 [03:02<00:00, 20.26s/it]
  res = hypotest_fun_out(*samples, **kwds)
Categorical variables:  12%|█▎        | 1/8 [02:28<17:22, 148.98s/it]


KeyboardInterrupt: 

In [32]:
import numpy as np
import pandas as pd
from scipy import stats
from tqdm import tqdm
from joblib import Parallel, delayed
from statsmodels.stats.multitest import multipletests
from tqdm_joblib import tqdm_joblib

# --- Load and threshold ---
X = adata.obsm["sparse_features"]
if not isinstance(X, np.ndarray):
    X = X.toarray()

X_thr = (X > np.percentile(X, 75, axis=0)).astype(float)
obs = adata.obs.copy()

# --- Separate continuous vs categorical variables ---
cont_vars, cat_vars = [], []
for c in obs.columns:
    dtype = obs[c].dtype
    if pd.api.types.is_numeric_dtype(dtype):
        cont_vars.append(c)
    elif pd.api.types.is_categorical_dtype(dtype) or pd.api.types.is_object_dtype(dtype):
        cat_vars.append(c)

print(f"Continuous: {len(cont_vars)}, Categorical: {len(cat_vars)}")

# --- Vectorized Spearman for continuous variables ---
def correlate_cont(var):
    y = obs[var].values
    if np.std(y) == 0:
        return pd.DataFrame({"feature": [], "variable": [], "type": [], "correlation": [], "pval": []})
    # rank-transform for Spearman
    y_rank = stats.rankdata(y)
    X_rank = np.apply_along_axis(stats.rankdata, 0, X_thr)
    corr = np.corrcoef(X_rank.T, y_rank)[-1, :-1]  # correlation of each feature with y_rank
    n = len(y)
    t_stat = corr * np.sqrt((n - 2) / (1 - corr**2))
    pval = 2 * stats.t.sf(np.abs(t_stat), df=n - 2)
    return pd.DataFrame({
        "feature": [f"F{i}" for i in range(X_thr.shape[1])],
        "variable": var,
        "type": "continuous",
        "correlation": corr,
        "pval": pval
    })

# --- Parallelized ANOVA for categorical variables ---
def correlate_cat(var):
    groups = [X_thr[obs[var] == level, :] for level in obs[var].dropna().unique()]
    valid = [len(g) >= 2 for g in groups]
    if not any(valid):
        return pd.DataFrame({"feature": [], "variable": [], "type": [], "correlation": [], "pval": []})
    F, p = stats.f_oneway(*[g for g in groups if len(g) >= 2])
    return pd.DataFrame({
        "feature": [f"F{i}" for i in range(X_thr.shape[1])],
        "variable": var,
        "type": "categorical",
        "correlation": F,
        "pval": p
    })

# --- Run everything in parallel ---
n_jobs = min(8, len(cont_vars) + len(cat_vars))  # adjust cores


with tqdm_joblib(tqdm(desc="Continuous vars", total=len(cont_vars))) as progress_bar:
    results_cont = Parallel(n_jobs=n_jobs)(
        delayed(correlate_cont)(var) for var in cont_vars
    )

with tqdm_joblib(tqdm(desc="Categorical vars", total=len(cat_vars))) as progress_bar:
    results_cat = Parallel(n_jobs=n_jobs)(
        delayed(correlate_cat)(var) for var in cat_vars
    )

results = results_cont + results_cat

# --- Combine and adjust p-values ---
corr_df = pd.concat(results, ignore_index=True)
corr_df["padj"] = multipletests(corr_df["pval"].fillna(1), method="fdr_bh")[1]
corr_df.sort_values("pval", inplace=True)
corr_df.head()


Continuous: 9, Categorical: 5


  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 9/9 [01:52<00:00, 12.53s/it]

  res = hypotest_fun_out(*samples, axis=axis, **kwds)
  res = hypotest_fun_out(*samples, axis=axis, **kwds)
  res = hypotest_fun_out(*samples, axis=axis, **kwds)
  res = hypotest_fun_out(*samples, axis=axis, **kwds)
  res = hypotest_fun_out(*samples, axis=axis, **kwds)

[A
[A
[A
100%|██████████| 5/5 [00:12<00:00,  2.42s/it]


Unnamed: 0,feature,variable,type,correlation,pval,padj
120086,F7006,tissue,categorical,83.181706,0.0,0.0
114364,F1284,tissue,categorical,93.348736,0.0,0.0
114363,F1283,tissue,categorical,14.096185,0.0,0.0
114358,F1278,tissue,categorical,152.577023,0.0,0.0
114353,F1273,tissue,categorical,104.046814,0.0,0.0


In [46]:
import numpy as np
import pandas as pd
from scipy import stats
from tqdm import tqdm
from joblib import Parallel, delayed
from statsmodels.stats.multitest import multipletests
from tqdm_joblib import tqdm_joblib

# --- Assume 'adata' is loaded ---
# import anndata as ad
# adata = ad.read_h5ad("your_data.h5ad")
# ---------------------------------

# --- Load feature activations ---
X = adata.obsm["sparse_features"]
if not isinstance(X, np.ndarray):
    X = X.toarray().astype(np.float32)

obs = adata.obs.copy()

# --- Pre-filter sparse features (as requested) ---
feature_mask = X.std(0) > 1e-8
X_filtered = X[:, feature_mask]
print(f"Using {X_filtered.shape[1]} / {X.shape[1]} features after filtering")

# --- Store the names of the features that passed the filter ---
original_feature_names = np.array([f"F{i}" for i in range(X.shape[1])])
filtered_feature_names = original_feature_names[feature_mask]

# --- Separate continuous vs categorical variables ---
cont_vars, cat_vars = [], []
for c in obs.columns:
    dtype = obs[c].dtype
    if pd.api.types.is_numeric_dtype(dtype):
        cont_vars.append(c)
    elif pd.api.types.is_categorical_dtype(dtype) or pd.api.types.is_object_dtype(dtype):
        cat_vars.append(c)

print(f"Continuous: {len(cont_vars)}, Categorical: {len(cat_vars)}")

# --- Pre-rank features for Spearman ---
# We already filtered constant columns, so no need to mask again
X_rank = np.apply_along_axis(stats.rankdata, 0, X_filtered).astype(np.float32)
X_rank = (X_rank - X_rank.mean(0)) / X_rank.std(0, ddof=1)

# --- Vectorized Spearman correlation for continuous variables ---
def correlate_cont(var):
    y = obs[var].values
    if np.std(y) == 0:
        return pd.DataFrame({"feature": [], "variable": [], "type": [], "effect_size": [], "pval": []})
    
    # rank-transform and z-score
    y_rank = stats.rankdata(y).astype(np.float32)
    if np.std(y_rank) < 1e-8:
        return pd.DataFrame({"feature": [], "variable": [], "type": [], "effect_size": [], "pval": []})
    y_rank = (y_rank - y_rank.mean()) / y_rank.std(ddof=1)
    
    corr = (X_rank.T @ y_rank) / (len(y_rank) - 1)
    
    # compute p-values
    n = len(y_rank)
    # add epsilon to avoid division by zero in (1 - corr**2)
    t_stat = corr * np.sqrt((n - 2) / (1 - corr**2 + 1e-12))
    pval = 2 * stats.t.sf(np.abs(t_stat), df=n - 2)
    
    return pd.DataFrame({
        "feature": filtered_feature_names,
        "variable": var,
        "type": "continuous",
        "effect_size": corr,  # Renamed from 'correlation'
        "pval": pval
    })

# --- Parallelized ANOVA for categorical variables ---
def correlate_cat(var):
    levels = obs[var].dropna().unique()
    
    # Use the pre-filtered X_filtered, not X
    groups = [X_filtered[obs[var] == level, :] for level in levels]
    valid = [len(g) >= 2 for g in groups]
    
    valid_groups = [g for g, v in zip(groups, valid) if v]
    
    if len(valid_groups) < 2: # Need at least 2 groups for ANOVA
        return pd.DataFrame({"feature": [], "variable": [], "type": [], "effect_size": [], "pval": []})
    
    F, p = stats.f_oneway(*valid_groups)
    
    # --- Calculate Eta-squared (η²) ---
    k = len(valid_groups) # number of groups
    N = sum(len(g) for g in valid_groups) # total number of samples
    
    df_between = k - 1
    df_within = N - k
    
    # Calculate eta-squared
    # Add epsilon to denominator to avoid division by zero
    eta_squared = (F * df_between) / (F * df_between + df_within + 1e-12)
    
    # Handle potential NaNs if F is NaN (e.g., all groups identical)
    eta_squared[np.isnan(F)] = 0
    p[np.isnan(F)] = 1
    
    return pd.DataFrame({
        "feature": filtered_feature_names,
        "variable": var,
        "type": "categorical",
        "effect_size": eta_squared, # <-- Storing η² instead of F
        "pval": p
    })

# --- Run everything in parallel with progress bars ---
n_jobs = min(8, len(cont_vars) + len(cat_vars))  # adjust cores

with tqdm_joblib(tqdm(desc="Continuous vars", total=len(cont_vars))) as progress_bar:
    results_cont = Parallel(n_jobs=n_jobs)(
        delayed(correlate_cont)(var) for var in cont_vars
    )

with tqdm_joblib(tqdm(desc="Categorical vars", total=len(cat_vars))) as progress_bar:
    results_cat = Parallel(n_jobs=n_jobs)(
        delayed(correlate_cat)(var) for var in cat_vars
    )

results = results_cont + results_cat

# --- Combine results and adjust p-values ---
corr_df = pd.concat(results, ignore_index=True)

# Handle any potential NaNs in pval before multitest
corr_df["pval"] = corr_df["pval"].fillna(1)

corr_df["padj"] = multipletests(corr_df["pval"], method="fdr_bh")[1]
corr_df.sort_values("pval", inplace=True)

print("\n--- Correlation Results ---")
print(corr_df.head())

# You can now sort by absolute effect size to find the strongest relationships
corr_df["abs_effect_size"] = corr_df["effect_size"].abs()
print("\n--- Strongest Relationships (by Effect Size) ---")
print(corr_df.sort_values("abs_effect_size", ascending=False).head())

Using 3348 / 10280 features after filtering
Continuous: 9, Categorical: 5




[A[A

[A[A

100%|██████████| 9/9 [00:01<00:00,  8.08it/s]


[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


100%|██████████| 5/5 [00:04<00:00,  1.21it/s]


--- Correlation Results ---
      feature        variable         type  effect_size  pval  padj
42240   F6308  tissue_general  categorical     0.228970   0.0   0.0
31266   F3409           assay  categorical     0.053440   0.0   0.0
34251   F2292       cell_type  categorical     0.091380   0.0   0.0
38984   F6624          tissue  categorical     0.138402   0.0   0.0
38979   F6612          tissue  categorical     0.075330   0.0   0.0

--- Strongest Relationships (by Effect Size) ---
      feature   variable         type  effect_size  pval  padj  \
18039   F3925   n_counts   continuous    -0.944450   0.0   0.0   
34738   F3810  cell_type  categorical     0.925595   0.0   0.0   
36558   F9398  cell_type  categorical     0.925019   0.0   0.0   
36576   F9474  cell_type  categorical     0.922548   0.0   0.0   
34978   F4533  cell_type  categorical     0.922091   0.0   0.0   

       abs_effect_size  
18039         0.944450  
34738         0.925595  
36558         0.925019  
36576         0.




In [60]:
corr_df

Unnamed: 0,feature,variable,type,effect_size,pval,padj,abs_effect_size
42240,F6308,tissue_general,categorical,0.228970,0.0,0.0,0.228970
31266,F3409,assay,categorical,0.053440,0.0,0.0,0.053440
34251,F2292,cell_type,categorical,0.091380,0.0,0.0,0.091380
38984,F6624,tissue,categorical,0.138402,0.0,0.0,0.138402
38979,F6612,tissue,categorical,0.075330,0.0,0.0,0.075330
...,...,...,...,...,...,...,...
39163,F7169,tissue,categorical,0.001486,1.0,1.0,0.001486
39150,F7136,tissue,categorical,0.000927,1.0,1.0,0.000927
36997,F496,tissue,categorical,0.000269,1.0,1.0,0.000269
36943,F345,tissue,categorical,0.000520,1.0,1.0,0.000520


In [50]:
significant_hits = corr_df[corr_df['padj'] < 0.01].copy()

# Sort by strongest absolute relationship
significant_hits['abs_effect_size'] = significant_hits['effect_size'].abs()
significant_hits.sort_values('abs_effect_size', ascending=False, inplace=True)

significant_hits

Unnamed: 0,feature,variable,type,effect_size,pval,padj,abs_effect_size
18039,F3925,n_counts,continuous,-0.944450,0.000000,0.000000,0.944450
34738,F3810,cell_type,categorical,0.925595,0.000000,0.000000,0.925595
36558,F9398,cell_type,categorical,0.925019,0.000000,0.000000,0.925019
36576,F9474,cell_type,categorical,0.922548,0.000000,0.000000,0.922548
34978,F4533,cell_type,categorical,0.922091,0.000000,0.000000,0.922091
...,...,...,...,...,...,...,...
32386,F6899,assay,categorical,0.000202,0.006363,0.009788,0.000202
31071,F2834,assay,categorical,0.000202,0.006444,0.009910,0.000202
31978,F5616,assay,categorical,0.000202,0.006454,0.009924,0.000202
31471,F4041,assay,categorical,0.000202,0.006474,0.009953,0.000202


In [55]:
significant_hits[significant_hits['feature'] == 'F3925']

Unnamed: 0,feature,variable,type,effect_size,pval,padj,abs_effect_size
18039,F3925,n_counts,continuous,-0.94445,0.0,0.0,0.94445
21387,F3925,n_genes,continuous,-0.871896,0.0,0.0,0.871896
28083,F3925,pct_ribo,continuous,0.572642,0.0,0.0,0.572642
24735,F3925,pct_mito,continuous,0.513016,0.0,0.0,0.513016
7995,F3925,G2M_score,continuous,-0.411689,0.0,0.0,0.411689
34779,F3925,cell_type,categorical,0.361433,0.0,0.0,0.361433
38127,F3925,tissue,categorical,0.308921,0.0,0.0,0.308921
4647,F3925,S_score,continuous,-0.235175,0.0,0.0,0.235175
14691,F3925,Hypoxia,continuous,-0.180366,0.0,0.0,0.180366
11343,F3925,Inflammation Response,continuous,0.147619,1.5094070000000002e-241,1.152826e-240,0.147619


In [61]:
import numpy as np
import pandas as pd

# --- Use your FULL, UNFILTERED corr_df ---
# (This is the DataFrame from before you made 'significant_hits')
df = corr_df.copy()

# 1. Make sure you have abs_effect_size
if 'abs_effect_size' not in df.columns:
    df['abs_effect_size'] = df['effect_size'].abs()

# 2. Sort by feature and effect size to find the top hits
df.sort_values(by=['feature', 'abs_effect_size'], ascending=[True, False], inplace=True)

# 3. Group by feature
g = df.groupby('feature')

# 4. Get the top hit (the best association for each feature)
top_hits_df = g.nth(0)

# 5. Get the second-best hit
second_hits_df = g.nth(1)

# 6. Create the summary table from the top hits
summary_df = top_hits_df.copy()

# 7. Add the second hit's effect size (will be NaN for features with only 1 hit)
summary_df['second_hit_effect_size'] = second_hits_df['abs_effect_size'].reindex(summary_df.index)

# 8. Fill NaNs with 0.0 (for features that genuinely only have one variable)
summary_df['second_hit_effect_size'].fillna(0.0, inplace=True)

# 9. Rename for clarity
summary_df.rename(columns={'abs_effect_size': 'top_hit_effect_size'}, inplace=True)

# 10. Calculate the robust specificity ratio
t = summary_df['top_hit_effect_size']
s = summary_df['second_hit_effect_size']

summary_df['specificity_ratio'] = np.where(
    s > 1e-12, # Is the second hit non-zero?
    t / s,     # Yes: normal division
    np.where(
        t > 1e-12, # No: Is the first hit non-zero?
        np.inf,    # Yes: (Hit / 0) -> Infinitely specific
        0.0        # No: (0 / 0) -> No hits at all
    )
)

# 11. --- THIS IS THE KEY ---
# Now, filter your summary table for significance
final_summary = summary_df[summary_df['padj'] < 0.01].copy()

# 12. Sort by your new, meaningful ratio
final_summary.sort_values('specificity_ratio', ascending=False, inplace=True)


# --- Display your final, sortable table ---
display_columns = [
    'variable',
    'top_hit_effect_size',
    'specificity_ratio', # <-- This will now have meaningful, finite numbers
    'padj',
    'second_hit_effect_size',
    'effect_size'
]

print("--- Final Feature Disentanglement Summary (Corrected Workflow) ---")
print(final_summary[display_columns].head(15))

--- Final Feature Disentanglement Summary (Corrected Workflow) ---
                    variable  top_hit_effect_size  specificity_ratio  \
33806              cell_type             0.046985                inf   
33481              cell_type             0.390588                inf   
10371  Inflammation Response             0.031163                inf   
36732              cell_type             0.083759                inf   
36733              cell_type             0.184346                inf   
40082                 tissue             0.161802                inf   
40083                 tissue             0.279795                inf   
37156                 tissue             0.045546                inf   
36736              cell_type             0.364037                inf   
36737              cell_type             0.118362                inf   
36738              cell_type             0.027133                inf   
36739              cell_type             0.021421                inf 

In [62]:
final_summary

Unnamed: 0,feature,variable,type,effect_size,pval,padj,top_hit_effect_size,second_hit_effect_size,specificity_ratio
33806,F999,cell_type,categorical,0.046985,4.573080e-263,3.637974e-262,0.046985,0.0,inf
33481,F10,cell_type,categorical,0.390588,0.000000e+00,0.000000e+00,0.390588,0.0,inf
10371,F1000,Inflammation Response,continuous,0.031163,3.176158e-12,7.296975e-12,0.031163,0.0,inf
36732,F10003,cell_type,categorical,0.083759,0.000000e+00,0.000000e+00,0.083759,0.0,inf
36733,F10006,cell_type,categorical,0.184346,0.000000e+00,0.000000e+00,0.184346,0.0,inf
...,...,...,...,...,...,...,...,...,...
6621,F10059,S_score,continuous,0.040650,9.616660e-20,2.636131e-19,0.040650,0.0,inf
36754,F10060,cell_type,categorical,0.553886,0.000000e+00,0.000000e+00,0.553886,0.0,inf
36756,F10063,cell_type,categorical,0.137031,0.000000e+00,0.000000e+00,0.137031,0.0,inf
36757,F10064,cell_type,categorical,0.025501,3.732737e-82,1.836666e-81,0.025501,0.0,inf
