In [1]:
import argparse
import logging
import sys
import re
from pathlib import Path
import numpy as np

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Use Arial for figures when the font is installed; otherwise fall back to the
# generic sans-serif family. Font weight is always set to normal.
_has_arial = any(font.name == "Arial" for font in fm.fontManager.ttflist)
plt.rcParams.update({
    "font.family": "Arial" if _has_arial else "sans-serif",
    "font.weight": "normal",
})

# Put the local cellseg-benchmark checkout on the import path so that the
# `cellseg_benchmark` import below resolves.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
sys.path.insert(1, "/dss/dsshome1/0C/ra98gaq/Git/cellseg-benchmark")
from cellseg_benchmark.dea_utils import (
    add_group_sample_counts,
    add_ensembl_id,
    safe_sheet,
)

def get_args(test_args=None):
    """Parse the command-line options of the Wilcoxon DE workflow.

    Args:
        test_args: Optional list of argument strings to parse instead of
            ``sys.argv[1:]`` (used to simulate a CLI call inside Jupyter).

    Returns:
        argparse.Namespace with the attributes ``cohort``, ``seg_method``,
        ``sample_key``, ``subset_key``, ``subset_values``, ``condition_key``,
        ``ref``, ``test_groups`` and ``overwrite``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments or ``-h``.
    """
    p = argparse.ArgumentParser(description="Run single-cell-based Wilcoxon DE test")
    p.add_argument("cohort", help="Cohort name, e.g. foxf2")
    p.add_argument("seg_method", help="Segmentation method, e.g. Cellpose_1_nuclei_model")
    p.add_argument("--sample_key", default="sample", help="obs column for donor/sample ID")
    p.add_argument("--subset_key", default="cell_type",
                        help="obs column used to subset data (e.g. cell_type, cluster, region)")
    p.add_argument("--subset_values", nargs="+", default=None,
                        help="Values of subset_key to process (default: all unique values)")
    p.add_argument("--condition_key", default="genotype", help="obs column for condition (e.g. genotype)")
    p.add_argument("--ref", default="WT", help="Reference group (default: WT)")
    p.add_argument("--test_groups", nargs="+", default=None,
                        help="Groups to test vs reference (default: all groups in condition_key except --ref)")
    p.add_argument(
        "--overwrite",
        # Any value other than true/1/yes (case-insensitive) is read as False.
        type=lambda x: str(x).lower() in ["true","1","yes"],
        default=True,
        help="Overwrite existing result files (default: True)"
    )

    # parse_args(None) falls back to sys.argv[1:], so one call serves both the
    # real CLI and the simulated argument list.
    return p.parse_args(test_args)

In [2]:
# Simulate CLI arguments inside Jupyter.
# Aging cohort: groups come from the "condition" obs column, with "WT_3" as the reference.
args = get_args([
    "aging",
    "Cellpose_1_nuclei_model",
    "--subset_key", "cell_type",
    "--condition_key", "condition",
    "--ref", "WT_3",
])

In [3]:
# Simulate CLI arguments inside Jupyter.
# Foxf2 cohort: groups come from the "genotype" obs column, with "WT" as the reference.
# NOTE(review): this cell rebinds `args`, so a top-to-bottom run continues with the
# foxf2 settings, whereas the outputs recorded further down show cohort='aging' and
# ref='WT_3'. Run only one of the two configuration cells.
args = get_args([
    "foxf2",
    "Cellpose_1_nuclei_model",
    "--subset_key", "cell_type",
    "--condition_key", "genotype",
    "--ref", "WT",
])

In [3]:
# Show the parsed arguments currently bound to `args`.
args

Namespace(cohort='aging', seg_method='Cellpose_1_nuclei_model', sample_key='sample', subset_key='cell_type', subset_values=None, condition_key='condition', ref='WT_3', test_groups=None, overwrite=True)

In [4]:
# Configure the "dea" logger exactly once: re-running this cell must not stack
# additional stream handlers on the same logger.
logger = logging.getLogger("dea")
if len(logger.handlers) == 0:
    log_format = logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s")
    handler = logging.StreamHandler()
    handler.setFormatter(log_format)
    logger.propagate = False
    logger.setLevel(logging.INFO)
    logger.addHandler(handler)

In [5]:
# Project root -> analysis/<cohort>/<seg_method>; DEA results go into its "dea" subfolder.
base_path = Path("/dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark")
method_path = base_path.joinpath("analysis", args.cohort, args.seg_method)
output_dir = method_path.joinpath("dea")
# Create the output folder (and any missing parents); an existing folder is fine.
output_dir.mkdir(parents=True, exist_ok=True)

In [6]:
logger.info("Loading integrated AnnData...")
adata_path = method_path / "adatas" / "adata_integrated.h5ad.gz"
adata = sc.read_h5ad(adata_path)
# Work with the revised annotation under the generic "cell_type" column name.
adata.obs["cell_type"] = adata.obs["cell_type_revised"]

2025-10-29 17:13:58,819 [INFO]: Loading integrated AnnData...


In [7]:
# Re-group cell types: fold three labels into broader groups.
cell_type_merges = {
    "Tanycytes": "Ependymal",
    "Astroependymal": "Astrocytes",
    "Neurons-Glyc-Gaba" : "Neurons-Other"
}
# Go through plain strings for the replacement, then back to a categorical so
# that only the labels still in use remain as categories.
cell_type_labels = adata.obs["cell_type"].astype(str)
adata.obs["cell_type"] = cell_type_labels.replace(cell_type_merges).astype("category")

In [8]:
# Number of cells per cell type after the re-grouping above.
adata.obs["cell_type"].value_counts()

cell_type
Oligodendrocytes            131904
Neurons-Glut                124130
Neurons-Dopa                 77224
Astrocytes                   75424
ECs                          49431
Neurons-Granule-Immature     42337
Neurons-Other                40619
Undefined                    32012
Microglia                    29106
OPCs                         20832
Pericytes                    15056
VLMCs                         8346
Ependymal                     6539
SMCs                          5928
Choroid-Plexus                5657
Neurons-Gaba                  3499
BAMs                          1933
Immune-Other                   566
Name: count, dtype: int64

In [9]:
# Infer the test groups when none were given: every non-missing level of the
# condition column except the reference group.
if args.test_groups is None:
    groups = adata.obs[args.condition_key].dropna().unique().tolist()
    args.test_groups = [g for g in groups if g != args.ref]
    msg = (f"Test groups inferred from '{args.condition_key}': {', '.join(map(str, args.test_groups))}"
           if args.test_groups else
           f"No test groups found different from ref '{args.ref}' under '{args.condition_key}'.")
    logger.info(msg)

# sample overview
if args.subset_values is None:
    # dropna() keeps sorted() from failing on a mix of labels and NaN.
    groups_to_process = sorted(adata.obs[args.subset_key].dropna().unique())
else:
    groups_to_process = args.subset_values
for cond, df in adata.obs.groupby(args.condition_key, observed=True):
    samples = df[args.sample_key].unique()
    # map(str, ...) so that non-string sample IDs do not break the join
    # (same treatment as the test-group message above).
    logger.info(f"{cond}: {len(samples)} samples → {', '.join(map(str, samples))}")

2025-10-29 17:21:41,619 [INFO]: Test groups inferred from 'condition': WT_6, WT_12, WT_18, WT_24
2025-10-29 17:21:42,612 [INFO]: WT_3: 3 samples → aging_s8_r1, aging_s10_r1, aging_s10_r2
2025-10-29 17:21:42,643 [INFO]: WT_6: 3 samples → aging_s1_r0, aging_s5_r1, aging_s7_r2
2025-10-29 17:21:42,659 [INFO]: WT_12: 3 samples → aging_s5_r2, aging_s8_r0, aging_s12_r0
2025-10-29 17:21:42,667 [INFO]: WT_18: 3 samples → aging_s6_r0, aging_s8_r2, aging_s11_r0
2025-10-29 17:21:42,679 [INFO]: WT_24: 3 samples → aging_s10_r0, aging_s11_r1, aging_s11_r2


In [10]:
# Groups that will be compared against the reference group.
args.test_groups

['WT_6', 'WT_12', 'WT_18', 'WT_24']

In [11]:
# Reference group of every comparison.
args.ref

'WT_3'

In [12]:
# Use the 'volume_log1p_norm' layer as the main data matrix; .copy() keeps .X and
# the layer from sharing one array.
adata.X = adata.layers['volume_log1p_norm'].copy()

In [13]:
# Remove the varm / obsm / obsp annotations from the AnnData object before it
# is copied per subset.
for _slot in ("varm", "obsm", "obsp"):
    delattr(adata, _slot)

In [14]:
# Progress marker for the start of the DE analysis.
logger.info("Run DEA...")

2025-10-29 17:21:47,476 [INFO]: Run DEA...


In [15]:
# Build one AnnData per subset level, plus an "all" entry holding every cell.
adata_dict = {}
# dropna(): a missing label cannot be matched with == and would only yield an
# empty subset.
observed_groups = adata.obs[args.subset_key].dropna().unique().tolist()
if args.subset_values is None:
    unique_groups = observed_groups
else:
    # Honour --subset_values: keep only the requested levels that actually occur.
    missing = [v for v in args.subset_values if v not in observed_groups]
    if missing:
        logger.warning(
            f"Requested subset values not found in '{args.subset_key}': "
            f"{', '.join(map(str, missing))}"
        )
    unique_groups = [v for v in args.subset_values if v in observed_groups]
for key in unique_groups:
    # .copy() turns the view into an independent object, so results written to
    # one subset later on do not touch `adata` or the other subsets.
    adata_dict[key] = adata[adata.obs[args.subset_key] == key].copy()
adata_dict["all"] = adata.copy()

In [16]:
# Keys of adata_dict: one per subset level plus "all".
adata_dict.keys()

dict_keys(['Neurons-Dopa', 'Astrocytes', 'Neurons-Granule-Immature', 'Pericytes', 'Undefined', 'VLMCs', 'OPCs', 'ECs', 'Oligodendrocytes', 'Neurons-Other', 'Microglia', 'Neurons-Gaba', 'SMCs', 'BAMs', 'Neurons-Glut', 'Ependymal', 'Choroid-Plexus', 'Immune-Other', 'all'])

In [38]:
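# Scratch section: step through one iteration of the DEA loop for a single subset (debugging aid; the full loop further down recomputes everything).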

In [16]:
# Subset inspected by the scratch cells below.
group_i = "ECs"

In [17]:
# Direct reference to the dict entry (no copy): anything written to adata_tmp
# is also visible through adata_dict[group_i].
adata_tmp = adata_dict[group_i]

In [18]:
# Inspect the selected subset.
adata_tmp

AnnData object with n_obs × n_vars = 46836 × 500
    obs: 'region', 'slide', 'cell_type_mmc_incl_low_quality_revised', 'cell_type_mmc_incl_low_quality_clusters', 'cell_type_mmc_incl_low_quality', 'cell_type_mmc_incl_mixed_revised', 'cell_type_mmc_incl_mixed_clusters', 'cell_type_mmc_incl_mixed', 'cell_type_mmc_raw_revised', 'cell_type_mmc_raw_clusters', 'cell_type_mmc_raw', 'area', 'volume_sum', 'volume_final', 'num_z_planes', 'size_normalized', 'surface_to_volume_ratio', 'sphericity', 'solidity', 'elongation', 'genotype', 'age_months', 'run_date', 'animal_id', 'organism', 'cohort', 'sample', 'n_counts', 'n_genes', 'condition', 'spatial_outlier', 'low_quality_cell', 'volume_outlier_cell', 'fov', 'cell_type'
    var: 'n_cells'
    uns: 'X_umap_20_50', 'X_umap_harmony_20_50', 'cell_type_mmc_raw_revised_colors', 'condition_colors', 'neighbors_harmony_20_50', 'neighbors_harmony_20_50_3D', 'pca', 'sample_colors', 'slide_colors'
    layers: 'counts', 'librarysize_log1p_norm', 'volume_log1p_n

In [19]:
# Number of cells per condition level within the selected subset.
adata_tmp.obs[args.condition_key].value_counts()

condition
WT_6      14040
PCKO_6    12755
ECKO_6    10246
GLKO_6     9795
Name: count, dtype: int64

In [43]:
# Wilcoxon rank-sum test of every condition level against the reference group,
# with Benjamini-Hochberg correction of the p-values.
sc.tl.rank_genes_groups(
    adata_tmp,
    groupby=args.condition_key,
    method='wilcoxon',
    reference=args.ref,
    corr_method='benjamini-hochberg',
)

In [44]:
# Collect one result table per test group from the rank_genes_groups run on
# adata_tmp and bring the columns into the output naming scheme.
present_groups = set(adata_tmp.obs[args.condition_key].astype("category").cat.categories)
dfs = []
for g in args.test_groups:
    if g not in present_groups:
        # g is not among the condition levels of this subset, so there is no
        # result to fetch (same guard as in the full loop).
        continue
    df = sc.get.rank_genes_groups_df(adata_tmp, group=g).sort_values("pvals")
    df["test_group"] = g   # add group name
    dfs.append(df)

if not dfs:
    # pd.concat([]) would fail with an unspecific "No objects to concatenate".
    raise ValueError(f"None of the test groups is present in subset '{group_i}'.")

df = pd.concat(dfs, ignore_index=True).rename(columns={
    "names": "gene",
    "logfoldchanges": "log2FC",
    "pvals": "PValue",
    "pvals_adj": "FDR"
})
# Linear fold change derived from the reported log2 fold change.
df["FC"] = np.power(2, df["log2FC"])
df["method"] = "wilcoxon_scanpy"

df = df[["gene", "FC", "log2FC", "PValue", "FDR", "scores", "method", "test_group"]]

In [45]:
# Preview the result table assembled for the selected subset.
df

Unnamed: 0,gene,FC,log2FC,PValue,FDR,scores,method,test_group
0,Tie1,0.528614,-0.919715,1.051443e-215,5.257217e-213,-31.347527,wilcoxon_scanpy,WT_18
1,Abcg2,0.575072,-0.798185,1.172368e-210,2.930919e-208,-30.974953,wilcoxon_scanpy,WT_18
2,Slc7a5,0.638004,-0.648363,7.882947e-163,1.313825e-160,-27.192888,wilcoxon_scanpy,WT_18
3,Mlc1,0.592040,-0.756234,2.026269e-161,2.532836e-159,-27.073393,wilcoxon_scanpy,WT_18
4,Jph3,0.612741,-0.706650,6.625877e-135,6.625877e-133,-24.719345,wilcoxon_scanpy,WT_18
...,...,...,...,...,...,...,...,...
1995,Loxl1,1.312980,0.392845,9.443953e-01,9.520114e-01,0.069747,wilcoxon_scanpy,WT_12
1996,Spp1,1.390382,0.475482,9.540696e-01,9.598286e-01,0.057597,wilcoxon_scanpy,WT_12
1997,Gjb2,1.343286,0.425767,9.688473e-01,9.724725e-01,0.039054,wilcoxon_scanpy,WT_12
1998,Twist1,1.393594,0.478810,9.705276e-01,9.724725e-01,0.036947,wilcoxon_scanpy,WT_12


In [None]:
# End of the scratch section.

In [17]:
# Run the Wilcoxon rank-sum test per subset (each test group vs. the reference)
# and stack the per-subset tables into one long data frame, `df_all`.

# scanpy raises for groups with fewer than two cells, so they are filtered out up front.
MIN_CELLS_PER_GROUP = 2
OUTPUT_COLUMNS = ["subset","gene","FC","log2FC","PValue","FDR","scores","method","test_group","ref"]

results = []
for subset, adata_tmp in adata_dict.items():

    # Condition levels of this subset that have enough cells to be tested.
    cells_per_group = adata_tmp.obs[args.condition_key].value_counts()
    valid_groups = set(cells_per_group[cells_per_group >= MIN_CELLS_PER_GROUP].index)

    if args.ref not in valid_groups:
        logger.warning(
            f"Skipping subset '{subset}': reference group '{args.ref}' not found "
            f"(or fewer than {MIN_CELLS_PER_GROUP} cells)."
        )
        continue

    # Decide what to test before running anything, so a subset without any
    # test group costs no computation.
    take = [g for g in args.test_groups if g in valid_groups]
    if not take:
        logger.warning(f"Skipping subset '{subset}': none of the test groups is available.")
        continue

    sc.tl.rank_genes_groups(
        adata_tmp,
        groupby=args.condition_key,
        # Restrict the run to the requested comparisons; each group is tested
        # against the reference only, so other condition levels are not needed.
        groups=take,
        method="wilcoxon",
        reference=args.ref,
        corr_method="benjamini-hochberg",
        # Test on .X even if the object carries a .raw attribute.
        use_raw=False,
    )

    # sort_values() already returns a new frame, so no extra copy is needed
    # before adding the group label.
    dfs = [
        sc.get.rank_genes_groups_df(adata_tmp, group=g)
        .sort_values("pvals")
        .assign(test_group=g)
        for g in take
    ]

    df = (pd.concat(dfs, ignore_index=True)
            .rename(columns={"names":"gene","logfoldchanges":"log2FC","pvals":"PValue","pvals_adj":"FDR"}))

    # Linear fold change derived from the reported log2 fold change.
    df["FC"] = np.power(2, df["log2FC"])
    df["method"] = "wilcoxon_scanpy"
    df["subset"] = subset
    df["ref"] = args.ref

    results.append(df[OUTPUT_COLUMNS])

if not results:
    # pd.concat([]) would fail with an unspecific "No objects to concatenate".
    raise ValueError(
        f"No DE results: no subset contains reference '{args.ref}' together with "
        f"at least one test group under '{args.condition_key}'."
    )

df_all = pd.concat(results, ignore_index=True).sort_values(["subset","test_group","PValue"])


In [18]:
logger.info("Format output table...")
# Comparison label of the form "<test_group>vs<ref>".
test_label = df_all["test_group"].astype(str)
ref_label = df_all["ref"].astype(str)
df_all["test"] = test_label + "vs" + ref_label

# Options forwarded to the project helper that adds the sample-count columns.
count_options = dict(
    condition_key=args.condition_key,
    sample_key=args.sample_key,
    ref=args.ref,
    test_groups=args.test_groups,
    subset_group="subset",
)
df_all = add_group_sample_counts(df_all, adata_dict, **count_options)

2025-10-29 17:32:19,896 [INFO]: Format output table...


In [19]:
# Annotate the result table with Ensembl gene IDs via the project helper.
# NOTE(review): judging by the log output recorded for this cell, the helper also
# renames outdated gene aliases (e.g. H2afj->H2aj) — confirm in cellseg_benchmark.dea_utils.
logger.info("Add ensembl gene ids...")
df_all = add_ensembl_id(df_all, logger=logger)

2025-10-29 17:32:20,592 [INFO]: Add ensembl gene ids...
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
3 input query terms found no hit:	['H2afj', 'Ctps', 'Pifo']
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
2025-10-29 17:32:25,204 [INFO]: 500 unique symbols -> 500 mapped (100.0%)
2025-10-29 17:32:25,222 [INFO]: Renamed aliases: H2afj->H2aj, Ctps->Ctps1, Pifo->Cimap3


In [20]:
# Preview the final table (all subsets and comparisons).
df_all

Unnamed: 0,subset,gene,FC,log2FC,PValue,FDR,scores,method,test_group,ref,test,test_group_n,ref_n,ensembl_id
0,Astrocytes,Mlc1,0.553369,-0.853688,0.000000e+00,0.000000e+00,-42.685715,wilcoxon_scanpy,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000035805
1,Astrocytes,Gja1,0.681958,-0.552246,0.000000e+00,0.000000e+00,-48.192207,wilcoxon_scanpy,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000050953
2,Astrocytes,Cspg5,0.565842,-0.821528,0.000000e+00,0.000000e+00,-38.539268,wilcoxon_scanpy,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000032482
3,Astrocytes,Itgb1,0.583135,-0.778098,4.150397e-243,5.187996e-241,-33.297832,wilcoxon_scanpy,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000025809
4,Astrocytes,Cat,0.602218,-0.731643,8.231685e-193,8.231685e-191,-29.620113,wilcoxon_scanpy,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000027187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37495,all,Acan,1.050432,0.070982,9.883883e-01,9.957624e-01,0.014554,wilcoxon_scanpy,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000030607
37496,all,Adamts1,1.120328,0.163921,9.897878e-01,9.957624e-01,0.012799,wilcoxon_scanpy,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000022893
37497,all,Ang4,1.010466,0.015021,9.927173e-01,9.967041e-01,0.009128,wilcoxon_scanpy,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000060615
37498,all,Pdgfa,1.038046,0.053871,9.979921e-01,9.990011e-01,0.002517,wilcoxon_scanpy,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000025856


In [21]:
logger.info("Export table(s)...")
# File stem: "<cohort>-by-<subset_key>" with underscores and hyphens stripped from the key.
subset_key_clean = re.sub(r'[_-]', '', args.subset_key)
name = f"{args.cohort}-by-{subset_key_clean}"

# One workbook per (method, comparison); inside it, one sheet per subset.
for (method, test), df_mt in df_all.groupby(["method", "test"]):
    has_values = df_mt.notna().any()
    df_mt = df_mt.loc[:, has_values]  # drop all-NaN cols
    xlsx = output_dir / f"{name}_{method}_{test}.xlsx"

    skip_existing = xlsx.exists() and not args.overwrite
    if skip_existing:
        logger.info(f"  Exists, skip: {xlsx}")
        continue

    # Passed to safe_sheet for every sheet of this workbook.
    used = set()
    with pd.ExcelWriter(xlsx, engine="xlsxwriter") as writer:
        for gid, g in df_mt.groupby("subset", sort=True):
            sheet = safe_sheet(gid, used)
            ranked = g.sort_values("PValue")
            ranked.to_excel(writer, sheet_name=sheet, index=False)
    logger.info(f"  Wrote: {xlsx}")

logger.info("Done.")

2025-10-29 17:32:25,390 [INFO]: Export table(s)...
2025-10-29 17:32:36,122 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_wilcoxon_scanpy_WT_12vsWT_3.xlsx
2025-10-29 17:32:44,186 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_wilcoxon_scanpy_WT_18vsWT_3.xlsx
2025-10-29 17:32:50,890 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_wilcoxon_scanpy_WT_24vsWT_3.xlsx
2025-10-29 17:32:57,774 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_wilcoxon_scanpy_WT_6vsWT_3.xlsx
2025-10-29 17:32:57,868 [INFO]: Done.
