In [1]:
import argparse
import logging
import re
import sys
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

import rpy2.robjects as ro
import anndata2ri
import rpy2.rinterface_lib.callbacks as rcb
from rpy2.robjects.conversion import localconverter
from rpy2.rinterface_lib.embedded import RRuntimeError

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Use Arial when matplotlib's font manager lists it, otherwise fall back to the generic sans-serif family.
plt.rcParams["font.family"] = "Arial" if "Arial" in [f.name for f in fm.fontManager.ttflist] else "sans-serif"
plt.rcParams['font.weight'] = 'normal'

# Make the local cellseg-benchmark checkout importable. It is inserted at index 1 on purpose:
# the R setup cell later reads this same path back via sys.path[1].
sys.path.insert(1, "/dss/dsshome1/0C/ra98gaq/Git/cellseg-benchmark")
from cellseg_benchmark.adata_utils import plot_pseudobulk_pca
from cellseg_benchmark._constants import cell_type_colors
from cellseg_benchmark.dea_utils import (
    pseudobulk_aggregate_and_filter,
    prepare_adata_for_rpy2,
    add_group_sample_counts,
    add_ensembl_id,
    safe_sheet,
)

# Silence the "Observation names are not unique" warning.
# NOTE(review): `message` is a regular expression, so the trailing `*` quantifies the final "e"
# rather than acting as a wildcard; the pattern still matches the intended message.
warnings.filterwarnings("ignore", message=".*Observation names are not unique*")

def get_args(test_args=None):
    """Parse the command-line options of the DREAM pseudobulk DEA.

    Args:
        test_args: Optional list of argument strings to parse instead of
            ``sys.argv[1:]`` (used to simulate a CLI call inside Jupyter).
            ``None`` parses the real command line.

    Returns:
        argparse.Namespace: the parsed options. ``threads``, ``min_cells`` and
        ``replicates_per_patient`` are ints; ``overwrite`` is a bool.
    """
    p = argparse.ArgumentParser(description="DREAM pseudobulk-based DEA")
    p.add_argument("cohort", help="Cohort name, e.g. foxf2")
    p.add_argument("seg_method", help="Segmentation method, e.g. Cellpose_1_nuclei_model")
    p.add_argument("--sample_key", default="sample", help="obs column for donor/sample ID")
    p.add_argument(
        "--subset_key",
        default="cell_type",
        help="obs column used to subset data (e.g. cell_type, cluster, region)",
    )
    p.add_argument(
        "--subset_values",
        nargs="+",
        default=None,
        help="Values of subset_key to process (default: all unique values)",
    )
    p.add_argument("--condition_key", default="genotype", help="obs column for condition (e.g. genotype)")
    p.add_argument(
        "--batch_key",
        default="slide",
        help="optional batch key for inclusion as covariate (default: slide)",
    )
    p.add_argument("--ref", default="WT", help="Reference group (default: WT)")
    p.add_argument(
        "--test_groups",
        nargs="+",
        default=None,
        help="Groups to test vs reference (default: all groups in condition_key except --ref)",
    )

    # type=int keeps a user-supplied value consistent with the integer default;
    # without it "--threads 8" would be stored as the string "8".
    p.add_argument("--threads", type=int, default=4, help="number of threads for DREAM parallelization")
    p.add_argument("--min_cells", type=int, default=15, help="Minimum cells per donor")
    p.add_argument(
        "--replicates_per_patient",
        type=int,
        default=1,
        help="Number of pseudoreplicates per donor",
    )
    p.add_argument(
        "--overwrite",
        # Accept common truthy spellings; anything else is treated as False.
        type=lambda x: str(x).lower() in ["true", "1", "yes"],
        default=True,
        help="Overwrite existing result files (default: True)",
    )
    # parse_args(None) falls back to sys.argv[1:], so no branching is needed.
    return p.parse_args(test_args)

In [2]:
# Simulate CLI arguments inside Jupyter (foxf2 cohort).
# NOTE: the next cell rebinds `args` for the aging cohort, so on a top-to-bottom run
# this configuration is overwritten before anything uses it.
args = get_args([
    "foxf2",
    "Cellpose_1_nuclei_model",
    "--sample_key", "sample",
    "--subset_key", "cell_type",
    "--condition_key", "genotype",
    "--ref", "WT"
])

In [3]:
# Simulate CLI arguments inside Jupyter (aging cohort).
# This assignment is the one in effect for the rest of the notebook on a top-to-bottom run.
args = get_args([
    "aging",
    "Cellpose_1_nuclei_model",
    "--sample_key", "sample",
    "--subset_key", "cell_type",
    "--condition_key", "condition",
    "--ref", "WT_3",
])

In [4]:
args

Namespace(cohort='aging', seg_method='Cellpose_1_nuclei_model', sample_key='sample', subset_key='cell_type', subset_values=None, condition_key='condition', batch_key='slide', ref='WT_3', test_groups=None, threads=4, min_cells=15, replicates_per_patient=1, overwrite=True)

In [5]:
# Logger setup: one stream handler on the "dea" logger, attached only once so that
# re-running the cell does not duplicate log lines.
logger = logging.getLogger("dea")
if len(logger.handlers) == 0:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s"))
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    logger.propagate = False

# Route rpy2's console callbacks through the same handlers.
rcb.logger.handlers = logger.handlers


def _r_console_print(text):
    """Forward regular R console output at DEBUG level."""
    logger.debug(f"R: {text.strip()}")


def _r_console_error(text):
    """Turn R console errors into a Python RRuntimeError."""
    raise RRuntimeError(text.strip())


def _r_console_message(text):
    """Forward R message() output at INFO level."""
    logger.info(f"R: {text.strip()}")


def _r_console_warn(text):
    """Log as WARNING only when the text starts with "warning"; otherwise as INFO."""
    if text.lstrip().lower().startswith("warning"):
        emit = logger.warning
    else:
        emit = logger.info
    emit(f"R: {text.strip()}")


rcb.consolewrite_print = _r_console_print
rcb.consolewrite_error = _r_console_error
rcb.consolewrite_message = _r_console_message
rcb.consolewrite_warn = _r_console_warn
rcb.consolewrite_warnerror = rcb.consolewrite_warn

In [6]:
# R setup
# Combined converter (rpy2 defaults + pandas + anndata2ri); used later with localconverter()
# and passed to the %%R cells via `-c conv`.
conv = ro.default_converter + ro.pandas2ri.converter + anndata2ri.converter
# sys.path[1] is expected to be the repository checkout added with sys.path.insert(1, ...)
# in the imports cell. NOTE(review): this relies on nothing else occupying index 1.
r_script = Path(sys.path[1]) / "cellseg_benchmark" / "dea_utils.r"
# Source the R helper script into the embedded R session.
ro.r["source"](str(r_script))
# NOTE(review): `edgeR_loop` is not referenced again in this notebook; the DEA cell calls
# ro.r['dream_loop'] instead. Confirm whether this lookup is still needed.
edgeR_loop = ro.globalenv["edgeR_loop"]

In [7]:
# Project root on the shared file system; results go to <root>/analysis/<cohort>/<seg_method>/dea.
base_path = Path("/dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark")
method_path = base_path.joinpath("analysis", args.cohort, args.seg_method)
output_dir = method_path.joinpath("dea")
# Create the output directory (and any missing parents); no error if it already exists.
output_dir.mkdir(parents=True, exist_ok=True)

In [8]:
logger.info("Loading integrated AnnData...")
integrated_h5ad = method_path / "adatas" / "adata_integrated.h5ad.gz"
adata = sc.read_h5ad(integrated_h5ad)
# Work on the revised annotation under the generic "cell_type" column name.
adata.obs["cell_type"] = adata.obs["cell_type_revised"]

2025-10-29 16:43:58,347 [INFO]: Loading integrated AnnData...


In [9]:
# re-group cell types: fold the listed labels into their broader class
cell_type_merges = {
    "Tanycytes": "Ependymal",
    "Astroependymal": "Astrocytes",
    "Neurons-Glyc-Gaba" : "Neurons-Other"
}
cell_type_labels = adata.obs["cell_type"].astype(str)
cell_type_labels = cell_type_labels.replace(cell_type_merges)
# Back to categorical so merged labels share a single category.
adata.obs["cell_type"] = cell_type_labels.astype("category")

In [10]:
adata.obs["cell_type"].value_counts()

cell_type
Oligodendrocytes            131904
Neurons-Glut                124130
Neurons-Dopa                 77224
Astrocytes                   75424
ECs                          49431
Neurons-Granule-Immature     42337
Neurons-Other                40619
Undefined                    32012
Microglia                    29106
OPCs                         20832
Pericytes                    15056
VLMCs                         8346
Ependymal                     6539
SMCs                          5928
Choroid-Plexus                5657
Neurons-Gaba                  3499
BAMs                          1933
Immune-Other                   566
Name: count, dtype: int64

In [11]:
# Clean up group names for R conversion
def _clean_group_name(name):
    """Return `name` with characters that are awkward on the R side replaced or removed.

    Spaces, "-" and "/" become "_"; "+", "(" and ")" are dropped. Missing values
    are passed through unchanged; other values are converted to str first.
    """
    if pd.isna(name):
        return name
    return (
        str(name)
        .replace(" ", "_").replace("-", "_").replace("/", "_")
        .replace("+", "").replace("(", "").replace(")", "")
    )


adata.obs[args.subset_key] = [_clean_group_name(key) for key in adata.obs[args.subset_key]]
for col in [args.condition_key, args.sample_key, args.subset_key]:
    adata.obs[col] = adata.obs[col].astype("category")

# Infer the test groups as every observed condition except the reference.
if args.test_groups is None:
    groups = adata.obs[args.condition_key].dropna().unique().tolist()
    args.test_groups = [g for g in groups if g != args.ref]
    msg = (f"Test groups inferred from '{args.condition_key}': {', '.join(map(str, args.test_groups))}"
           if args.test_groups else
           f"No test groups found different from ref '{args.ref}' under '{args.condition_key}'.")
    logger.info(msg)

# Groups to process
if args.subset_values is None:
    # Drop missing values first: sorted() cannot compare NaN with strings.
    groups_to_process = sorted(adata.obs[args.subset_key].dropna().unique())
else:
    # Apply the same cleaning as the obs column, otherwise a user-supplied value such as
    # "Neurons-Glut" could never match the cleaned "Neurons_Glut".
    groups_to_process = [_clean_group_name(value) for value in args.subset_values]
    known_groups = set(adata.obs[args.subset_key].dropna().unique())
    unknown_groups = [g for g in groups_to_process if g not in known_groups]
    if unknown_groups:
        logger.warning(
            f"Requested subset values not found in '{args.subset_key}': "
            f"{', '.join(map(str, unknown_groups))}"
        )

# print sample overview
for cond, df in adata.obs.groupby(args.condition_key, observed=True):
    samples = df[args.sample_key].unique()
    # map(str, ...) so non-string sample IDs do not break the join.
    logger.info(f"{cond}: {len(samples)} samples → {', '.join(map(str, samples))}")

2025-10-29 16:51:29,626 [INFO]: Test groups inferred from 'condition': WT_6, WT_12, WT_18, WT_24
2025-10-29 16:51:30,783 [INFO]: WT_3: 3 samples → aging_s8_r1, aging_s10_r1, aging_s10_r2
2025-10-29 16:51:30,811 [INFO]: WT_6: 3 samples → aging_s1_r0, aging_s5_r1, aging_s7_r2
2025-10-29 16:51:30,819 [INFO]: WT_12: 3 samples → aging_s5_r2, aging_s8_r0, aging_s12_r0
2025-10-29 16:51:30,831 [INFO]: WT_18: 3 samples → aging_s6_r0, aging_s8_r2, aging_s11_r0
2025-10-29 16:51:30,847 [INFO]: WT_24: 3 samples → aging_s10_r0, aging_s11_r1, aging_s11_r2


In [12]:
# Per (subset, sample) statistics, broadcast back onto every cell of that group.
per_subset_sample = adata.obs.groupby([args.subset_key, args.sample_key], observed=True)
adata.obs['n_cells_sum'] = per_subset_sample[args.sample_key].transform('count')
adata.obs['volume_mean'] = per_subset_sample['volume_final'].transform('mean')
adata.obs['volume_sum'] = per_subset_sample['volume_final'].transform('sum')

In [13]:
# Pseudobulking sums raw counts, so X is replaced by a copy of the 'counts' layer.
adata.X = adata.layers['counts'].copy()
# Fail early, and say why, if the layer does not hold integer counts.
assert np.issubdtype(adata.X.dtype, np.integer), (
    f"Expected integer raw counts in adata.layers['counts'], got dtype {adata.X.dtype}"
)

In [14]:
# obs columns carried over into the pseudobulk object.
obs_to_keep = [
    args.condition_key,
    args.subset_key,
    args.sample_key,
    "n_cells_sum",
    "volume_mean",
    "volume_sum",
]
# The batch column is optional: keep it only when it is set and present in obs,
# otherwise disable it for the rest of the run.
if not args.batch_key or args.batch_key not in adata.obs.columns:
    logger.warning(
        f"Batch column '{args.batch_key}' was not found in adata.obs "
        "or is invalid. Continuing **without** a batch covariate."
    )
    args.batch_key = None
else:
    obs_to_keep.append(args.batch_key)

In [15]:
logger.info("Run pseudobulking...")
if not groups_to_process:
    raise ValueError("No groups to process: 'groups_to_process' is empty.")


def _pseudobulk_subset(subset_value):
    """Pseudobulk one subset (or all cells when `subset_value` is None) with the run's settings."""
    return pseudobulk_aggregate_and_filter(
        adata,
        subset_value=subset_value,
        sample_key=args.sample_key,
        subset_key=args.subset_key,
        obs_to_keep=obs_to_keep,
        min_cells=args.min_cells,
        replicates_per_patient=args.replicates_per_patient,
        logger=logger
    )


total = len(groups_to_process)
# Collect the per-group results and concatenate once instead of re-concatenating
# the growing object on every iteration.
pseudobulks = []
for i, group in enumerate(groups_to_process, 1):
    logger.info(f"Processing {group} ({i}/{total})")
    pseudobulks.append(_pseudobulk_subset(group))

if total == 1:
    # A single group needs no concatenation.
    adata_pb = pseudobulks[0]
else:
    adata_pb = ad.concat(pseudobulks, join="outer", label=None, index_unique=None)
adata_pb.obs_names_make_unique()

# Add whole brain: pseudobulk per sample across all groups
adata_pb_i = _pseudobulk_subset(None)
adata_pb_i.obs[args.subset_key] = "all"
adata_pb = ad.concat([adata_pb, adata_pb_i])
# Free the single-cell object; only the pseudobulk data is needed from here on.
del adata

2025-10-29 16:51:35,886 [INFO]: Run pseudobulking...
2025-10-29 16:51:35,902 [INFO]: Processing Astrocytes (1/18)
2025-10-29 16:51:46,618 [INFO]: 	Processing donor 1/15...
2025-10-29 16:52:10,807 [INFO]: 	Processing donor 15/15...
2025-10-29 16:52:12,311 [INFO]: Processing BAMs (2/18)
2025-10-29 16:52:15,360 [INFO]: 	Processing donor 1/14...
2025-10-29 16:52:30,511 [INFO]: 	Processing donor 14/14...
2025-10-29 16:52:31,752 [INFO]: Processing Choroid_Plexus (3/18)
2025-10-29 16:52:34,537 [INFO]: 	Processing donor 1/15...
2025-10-29 16:52:51,545 [INFO]: 	Processing donor 15/15...
2025-10-29 16:52:53,507 [INFO]: Processing ECs (4/18)
2025-10-29 16:53:04,601 [INFO]: 	Processing donor 1/15...
2025-10-29 16:53:31,329 [INFO]: 	Processing donor 15/15...
2025-10-29 16:53:34,914 [INFO]: Processing Ependymal (5/18)
2025-10-29 16:53:39,427 [INFO]: 	Processing donor 1/15...
2025-10-29 16:53:56,973 [INFO]: 	Processing donor 15/15...
2025-10-29 16:53:57,937 [INFO]: Processing Immune_Other (6/18)
2025

In [16]:
# PCA overview of the pseudobulk samples, written under output_dir.
# NOTE(review): the logged output shows the helper skipping because the PNG already exists
# even though args.overwrite is True — verify whether it should honour --overwrite.
plot_pseudobulk_pca(adata_pb, args, output_dir, cell_type_colors, logger)

2025-10-29 17:03:10,730 [INFO]: Skipping PCA plot, file already exists: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/pseudobulk_pca.png


In [17]:
logger.info("Run DEA...")
adata_pb = prepare_adata_for_rpy2(adata_pb, key=args.subset_key)
# One independent AnnData copy per subset value, keyed by that value.
unique_groups = adata_pb.obs[args.subset_key].unique().tolist()
adatas_pb = {
    group_name: adata_pb[adata_pb.obs[args.subset_key] == group_name].copy()
    for group_name in unique_groups
}

2025-10-29 17:03:10,747 [INFO]: Run DEA...


In [15]:
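# Exploratory section: inspect a single group's SingleCellExperiment object in R, step by step.
# Skip every cell from here down to the "# end" marker when running the notebook as a script;
# the outputs shown in this section come from an earlier interactive session.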

In [16]:
%load_ext rpy2.ipython

In [17]:
args.test_groups

['GLKO', 'ECKO', 'PCKO']

In [18]:
# Pick one group and bind plain-named globals: the %%R cells below import them by name via `-i`.
group_i = "ECs"
# NOTE: this rebinds `adata` (the single-cell object was deleted after pseudobulking)
# to the pseudobulk subset of the chosen group.
adata = adatas_pb[group_i]
condition_col = args.condition_key
test_groups = args.test_groups
ref_group = args.ref
batch_col = args.batch_key
threads = args.threads

In [19]:
batch_col

'slide'

In [20]:
test_groups

['GLKO', 'ECKO', 'PCKO']

In [21]:
threads

4

In [22]:
adata.obs

Unnamed: 0,genotype,cell_type,slide,n_cells_sum,volume_mean,volume_sum,sample
donor_foxf2_s1_r0_0-4,ECKO,ECs,1,3770,1432.30689,5399797.0,foxf2_s1_r0
donor_foxf2_s1_r1_0-4,WT,ECs,1,3244,1441.239386,4675381.0,foxf2_s1_r1
donor_foxf2_s2_r1_0-4,WT,ECs,2,3709,1321.529465,4901553.0,foxf2_s2_r1
donor_foxf2_s2_r2_0-4,PCKO,ECs,2,3730,1422.418953,5305623.0,foxf2_s2_r2
donor_foxf2_s4_r0_0-3,PCKO,ECs,4,4136,1372.144486,5675190.0,foxf2_s4_r0
donor_foxf2_s4_r1_0-4,ECKO,ECs,4,2965,1478.098312,4382561.0,foxf2_s4_r1
donor_foxf2_s5_r0_0-4,PCKO,ECs,5,4889,1326.376653,6484655.0,foxf2_s5_r0
donor_foxf2_s5_r1_0-4,ECKO,ECs,5,3511,1172.656272,4117196.0,foxf2_s5_r1
donor_foxf2_s6_r0_0-4,GLKO,ECs,6,2986,1495.124579,4464442.0,foxf2_s6_r0
donor_foxf2_s6_r1_0-4,GLKO,ECs,6,3486,1512.162401,5271398.0,foxf2_s6_r1


In [23]:
#! mamba install bioconda::bioconductor-variancepartition -y


Looking for: ['bioconda::bioconductor-variancepartition']

conda-forge/linux-64                                        Using cache
conda-forge/noarch                                          Using cache
bioconda/linux-64                                           Using cache
bioconda/noarch                                             Using cache

Pinned packages:
  - python 3.12.*


Transaction

  Prefix: /home/ubuntu/miniforge3

  Updating specs:

   - bioconda::bioconductor-variancepartition
   - ca-certificates
   - certifi
   - openssl


  Package                           Version  Build          Channel        Size
─────────────────────────────────────────────────────────────────────────────────
  Reinstall:
─────────────────────────────────────────────────────────────────────────────────

  [32mo bioconductor-variancepartition[0m   1.36.2  r44hdfd78af_0  bioconda[32m     Cached[0m

  Summary:

  Reinstall: 1 packages

  Total download: 0 B

───────────────────────────────────

In [33]:
%%R
# R packages used by the exploratory cells of this section:
# variancePartition (voomWithDreamWeights/dream/eBayes/topTable), edgeR (DGEList/calcNormFactors),
# BiocParallel (SnowParam) and data.table (as.data.table/rbindlist).
library("variancePartition")
library("edgeR")
library("BiocParallel")
library(data.table)

data.table 1.17.8 using 2 threads (see ?getDTthreads).  Latest news: r-datatable.com

Attaching package: ‘data.table’

The following object is masked from ‘package:SummarizedExperiment’:

    shift

The following object is masked from ‘package:GenomicRanges’:

    shift

The following object is masked from ‘package:IRanges’:

    shift

The following objects are masked from ‘package:S4Vectors’:

    first, second



In [25]:
condition_col

'genotype'

In [26]:
%%R -c conv -i adata -i condition_col -i test_groups -i group_i -i ref_group -i batch_col -i threads
# Exploratory design setup for one group.
# Leaves `df` (sample table with `condition`, and `batch` when used), `form` (model formula),
# `param` (BiocParallel backend) and `present` (test groups found in the data) in the R session.
# Nothing is exported back to Python: the former `-o design` referred to an object this cell never creates.

# Condition factor restricted to the levels actually observed in this group.
condition <- droplevels(factor(colData(adata)[[condition_col]]))
if (nlevels(condition) < 2) {
    # return() is only valid inside a function; at cell top level abort with stop() instead.
    stop(group_i, ": skip (condition invalid or <2 levels)")
}

present <- intersect(test_groups, levels(condition))
if (!(ref_group %in% levels(condition)) || !length(present)) {
    stop(group_i, ": skip (ref_group/test_groups missing)")
}

missing <- setdiff(test_groups, present)
message(group_i, ": ", paste(present, collapse="/"), " vs ", ref_group,
        if (length(missing)) paste0(" (missing:", paste(missing, collapse=","), ")") else "")

param <- BiocParallel::SnowParam(as.numeric(threads), "SOCK", progressbar = TRUE)

df <- as.data.frame(colData(adata))

# Put the reference group first so that model coefficients are contrasts against it.
condition <- stats::relevel(condition, ref = ref_group)
if (levels(condition)[1L] != ref_group) stop("ref_group is not the first level after relevel().")
df$condition <- condition

# use batch only if it exists AND has >1 level
use_batch <- !is.null(batch_col) &&
             batch_col %in% colnames(colData(adata)) &&
             nlevels(droplevels(factor(colData(adata)[[batch_col]]))) > 1

if (use_batch) {
    df$batch <- droplevels(factor(df[[batch_col]]))
    form <- ~ condition + batch
    message("Design: ~ ", condition_col, " + ", batch_col)
} else {
    form <- ~ condition
    message("Design: ~ ", condition_col, if (!is.null(batch_col)) " (batch ignored: <2 levels or missing)" else "")
}

ECs: GLKO/ECKO/PCKO vs WT
Design: ~ genotype + slide


In [28]:
#%%R -o libsize -o vol_sum
#libsize <- colSums(SummarizedExperiment::assay(adata, "X"))
#vol_sum <- colData(adata)$volume_sum
#cor(libsize, vol_sum)

In [30]:
%%R -o s
# Build the edgeR DGEList for this group and export its sample table `s` to Python.
# (`-o c` was dropped: no variable `c` is created here, so it exported R's base function c.)
# NOTE(review): min_count is assigned but not used anywhere in this cell.
min_count = 2
y <- edgeR::DGEList(assay(adata, "X"), group = df$condition)

# replace library sizes with volume
vol <- colData(adata)[["volume_sum"]]
# anyNA() is checked first because any(vol <= 0) yields NA for missing values, which if() rejects.
if (is.null(vol) || anyNA(vol) || any(vol <= 0)) stop("Invalid volume in colData(adata), expects 'volume_sum'.")
y$samples$lib.size <- vol
y <- edgeR::calcNormFactors(y)

s <- y$samples

In [31]:
s

Unnamed: 0,group,lib.size,norm.factors
donor_foxf2_s1_r0_0-4,ECKO,5399797.0,1.326654
donor_foxf2_s1_r1_0-4,WT,4675381.0,1.33367
donor_foxf2_s2_r1_0-4,WT,4901553.0,0.871858
donor_foxf2_s2_r2_0-4,PCKO,5305623.0,0.914129
donor_foxf2_s4_r0_0-3,PCKO,5675190.0,1.290681
donor_foxf2_s4_r1_0-4,ECKO,4382561.0,1.403576
donor_foxf2_s5_r0_0-4,PCKO,6484655.0,0.78573
donor_foxf2_s5_r1_0-4,ECKO,4117196.0,0.59662
donor_foxf2_s6_r0_0-4,GLKO,4464442.0,0.975105
donor_foxf2_s6_r1_0-4,GLKO,5271398.0,0.851289


In [34]:
%%R -o dt_all
# Run variancePartition's voomWithDreamWeights on the DGEList `y` with formula `form`
# and sample table `df` (all three come from earlier cells of this section).
vobjDream <- variancePartition::voomWithDreamWeights(y, form, df, BPPARAM = param)

# NOTE(review): BPPARAM is passed above but not to dream(); confirm that is intended.
fitmm <- variancePartition::dream(vobjDream, form, df)
fitmm <- variancePartition::eBayes(fitmm)

# One full topTable per test group, tagged with the group and stacked into `dt_all`.
# The coefficient is looked up as "condition" followed by the group name.
# NOTE(review): this iterates over all `test_groups`, not the `present` subset computed
# in the design cell — a test group absent from the data would presumably fail here.
res_list <- lapply(test_groups, function(grp) {
    tt <- variancePartition::topTable(
        fitmm,
        coef = paste0("condition", grp),
        number = Inf,
        sort.by = "p"
    )
    as.data.table(tt, keep.rownames = "gene")[, test_group := grp]
})
dt_all <- rbindlist(res_list)

In [35]:
dt_all

Unnamed: 0,gene,logFC,AveExpr,t,P.Value,adj.P.Val,B,test_group
1,Foxf2,-2.890146,9.004545,-12.591761,0.000002,0.001092,5.207381,GLKO
2,Slc16a1,-0.791860,10.476767,-7.039470,0.000134,0.033544,1.623069,GLKO
3,Depp1,0.775995,8.260951,5.792303,0.000482,0.080380,0.392768,GLKO
4,Atp13a5,-0.806453,7.167773,-4.311647,0.002856,0.292346,-1.344844,GLKO
5,Abcg2,0.518439,10.808551,4.293942,0.002923,0.292346,-1.466834,GLKO
...,...,...,...,...,...,...,...,...
1496,Flrt2,-0.002631,8.413505,-0.010844,0.991625,0.999043,-6.056041,PCKO
1497,Icam1,-0.003221,7.277082,-0.009001,0.993049,0.999043,-5.965048,PCKO
1498,Rad54b,-0.000727,7.183502,-0.004880,0.996231,0.999294,-5.985783,PCKO
1499,Spp1,-0.000469,6.090952,-0.001511,0.998833,0.999294,-5.880505,PCKO


In [None]:
# end

In [18]:
# Run the R-side `dream_loop` once per group and keep every non-NULL result.
all_degs = {}
# R expects NULL (not None) when no batch covariate is used.
batch_arg = ro.NULL if args.batch_key is None else args.batch_key
for group_i, adata_tmp in adatas_pb.items():
    with localconverter(conv):
        res = ro.r['dream_loop'](
            adata=adata_tmp,
            group_i=group_i,
            test_groups=ro.StrVector(args.test_groups),
            ref_group=args.ref,
            condition_col=args.condition_key,
            batch_col=batch_arg,
            threads=args.threads,
            min_count=2
        )
    if res is ro.NULL:
        continue
    all_degs[group_i] = res

2025-10-29 17:03:30,492 [INFO]: R: Astrocytes: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 17:05:33,488 [INFO]: R: BAMs: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 17:06:40,545 [INFO]: R: Choroid_Plexus: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 17:06:45,008 [INFO]: R: ECs: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 17:06:51,992 [INFO]: R: Ependymal: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 17:07:26,106 [INFO]: R: Immune_Other: skip (ref_group/test_groups missing)
2025-10-29 17:07:26,287 [INFO]: R: Microglia: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 17:07:57,213 [INFO]: R: Neurons_Dopa: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 17:08:09,777 [INFO]: R: Neurons_Gaba: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 17:08:14,478 [INFO]: R: Neurons_Glut: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 17:08:15,849 [INFO]: R: Neurons_Granule_Immature: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 17:08:34,967 [INFO]: R: Neurons_Other: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 17:08:57,551 [INFO]: R: OPCs: WT_

In [19]:
logger.info("Format output table...")
# One frame per subset, tagged with the subset name, stacked into a single table
# indexed by gene (the "gene" column is kept as well).
deg_frames = []
for subset_name, deg_table in all_degs.items():
    deg_frames.append(pd.DataFrame(deg_table).assign(subset=subset_name))
collapsed_df = pd.concat(deg_frames, ignore_index=True)
collapsed_df = collapsed_df.set_index("gene", drop=False)

# Preferred leading columns (those that exist), then every remaining column in its current order.
front = ["subset", "gene", "FC", "logFC", "PValue", "FDR", "AveExpr", "t", "B"]
available = set(collapsed_df.columns)
order = [col for col in front if col in available]
rest = [col for col in collapsed_df.columns if col not in order]
collapsed_df = collapsed_df.loc[:, order + rest]

collapsed_df = add_group_sample_counts(
    collapsed_df,
    adatas_pb,
    condition_key=args.condition_key,
    sample_key=args.sample_key,
    ref=args.ref,
    test_groups=args.test_groups,
    subset_group="subset",
)

logger.info("Add ensembl gene ids...")
collapsed_df = add_ensembl_id(collapsed_df, logger=logger)

2025-10-29 17:10:38,823 [INFO]: Format output table...
2025-10-29 17:10:39,007 [INFO]: Add ensembl gene ids...
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
3 input query terms found no hit:	['Pifo', 'Ctps', 'H2afj']
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
2025-10-29 17:10:43,431 [INFO]: 500 unique symbols -> 500 mapped (100.0%)
2025-10-29 17:10:43,432 [INFO]: Renamed aliases: Pifo->Cimap3, Ctps->Ctps1, H2afj->H2aj


In [20]:
collapsed_df

Unnamed: 0,subset,gene,FC,logFC,PValue,FDR,AveExpr,t,B,method,test_group,ref,test,test_group_n,ref_n,ensembl_id
0,Astrocytes,Lamp5,0.312137,-1.679751,0.000134,0.038272,7.673805,-6.669579,1.572729,DREAM,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000027270
1,Astrocytes,Gata3,6.032899,2.592851,0.000153,0.038272,3.380014,6.545563,1.366111,DREAM,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000015619
2,Astrocytes,Mmp9,3.609200,1.851679,0.000855,0.142474,3.138626,5.079559,-0.181532,DREAM,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000017737
3,Astrocytes,Abcb1a,2.236494,1.161239,0.001944,0.242947,7.128629,4.455991,-1.055956,DREAM,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000040584
4,Astrocytes,Bcl2,1.922956,0.943326,0.005291,0.506050,7.619215,3.747909,-2.083698,DREAM,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000057329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35995,all,Tgfb3,0.997319,-0.003873,0.977760,0.985645,10.225112,-0.028501,-6.738243,DREAM,WT_24,WT_3,WT_24vsWT_3,3,3,ENSMUSG00000021253
35996,all,Gatm,1.001780,0.002566,0.987958,0.993390,14.624825,0.015431,-6.841523,DREAM,WT_24,WT_3,WT_24vsWT_3,3,3,ENSMUSG00000027199
35997,all,C1qtnf4,0.998493,-0.002176,0.989417,0.993390,12.965992,-0.013561,-6.834521,DREAM,WT_24,WT_3,WT_24vsWT_3,3,3,ENSMUSG00000040794
35998,all,Prss23,0.999370,-0.000910,0.995129,0.995203,10.660097,-0.006241,-6.752030,DREAM,WT_24,WT_3,WT_24vsWT_3,3,3,ENSMUSG00000039405


In [21]:
args.overwrite

True

In [22]:
logger.info("Export table(s)...")
subset_key_clean = re.sub(r'[_-]', '', args.subset_key)
name = f"{args.cohort}-by-{subset_key_clean}_pseudobulk"

# One workbook per (method, test) pair, one sheet per subset, rows ordered by p-value.
for (method, test), df_mt in collapsed_df.groupby(["method", "test"]):
    xlsx = output_dir / f"{name}_{method}_{test}.xlsx"
    if not args.overwrite and xlsx.exists():
        logger.info(f"  Exists, skip: {xlsx}")
        continue

    # drop all-NaN cols
    has_values = df_mt.notna().any()
    df_mt = df_mt.loc[:, has_values]

    # Sheet names handed out so far within this workbook, passed to safe_sheet.
    used = set()
    with pd.ExcelWriter(xlsx, engine="xlsxwriter") as writer:
        for gid, g in df_mt.groupby("subset", sort=True):
            sheet_name = safe_sheet(gid, used)
            ranked = g.sort_values("PValue")
            ranked.to_excel(writer, sheet_name=sheet_name, index=False)
    logger.info(f"  Wrote: {xlsx}")

logger.info("Done.")

2025-10-29 17:10:43,599 [INFO]: Export table(s)...
2025-10-29 17:10:46,119 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_DREAM_WT_12vsWT_3.xlsx
2025-10-29 17:10:48,699 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_DREAM_WT_18vsWT_3.xlsx
2025-10-29 17:10:50,938 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_DREAM_WT_24vsWT_3.xlsx
2025-10-29 17:10:53,844 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_DREAM_WT_6vsWT_3.xlsx
2025-10-29 17:10:53,853 [INFO]: Done.


In [23]:
#