In [1]:
import argparse
import logging
import re
import sys
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

import rpy2.robjects as ro
import anndata2ri
import rpy2.rinterface_lib.callbacks as rcb
from rpy2.robjects.conversion import localconverter
from rpy2.rinterface_lib.embedded import RRuntimeError

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
plt.rcParams["font.family"] = "Arial" if "Arial" in [f.name for f in fm.fontManager.ttflist] else "sans-serif"
plt.rcParams['font.weight'] = 'normal'

sys.path.insert(1, "/dss/dsshome1/0C/ra98gaq/Git/cellseg-benchmark")
from cellseg_benchmark.adata_utils import plot_pseudobulk_pca
from cellseg_benchmark._constants import cell_type_colors
from cellseg_benchmark.dea_utils import (
    pseudobulk_aggregate_and_filter,
    prepare_adata_for_rpy2,
    add_group_sample_counts,
    add_ensembl_id,
    safe_sheet,
)

warnings.filterwarnings("ignore", message=".*Observation names are not unique*")
VALID_METHODS = {"LRT", "QL"}


def _parse_bool_flag(value):
    """Convert a CLI string to a bool, rejecting unrecognised spellings.

    Accepts (case-insensitively) ``true``/``1``/``yes`` as True and
    ``false``/``0``/``no`` as False.

    Raises:
        argparse.ArgumentTypeError: for any other value, so that a typo is
            reported by argparse instead of silently being treated as False.
    """
    text = str(value).lower()
    if text in {"true", "1", "yes"}:
        return True
    if text in {"false", "0", "no"}:
        return False
    raise argparse.ArgumentTypeError(
        f"expected one of true/1/yes or false/0/no, got {value!r}"
    )


def get_args(test_args=None):
    """Parse the command-line options of the edgeR pseudobulk DEA script.

    Args:
        test_args: Optional list of argument strings. When given, these are
            parsed instead of ``sys.argv`` (useful inside a notebook).

    Returns:
        argparse.Namespace holding the parsed options.
    """
    p = argparse.ArgumentParser(description="Run edgeR pseudobulk-based DEA")
    p.add_argument("cohort", help="Cohort name, e.g. foxf2")
    p.add_argument("seg_method", help="Segmentation method, e.g. Cellpose_1_nuclei_model")
    p.add_argument("--sample_key", default="sample", help="obs column for donor/sample ID")
    p.add_argument(
        "--subset_key",
        default="cell_type",
        help="obs column used to subset data (e.g. cell_type, cluster, region)",
    )
    p.add_argument(
        "--subset_values",
        nargs="+",
        default=None,
        help="Values of subset_key to process (default: all unique values)",
    )
    p.add_argument("--condition_key", default="genotype", help="obs column for condition (e.g. genotype)")
    p.add_argument(
        "--batch_key",
        default="slide",
        help="optional batch key for inclusion as covariate (default: slide)",
    )
    p.add_argument("--ref", default="WT", help="Reference group (default: WT)")
    p.add_argument(
        "--test_groups",
        nargs="+",
        default=None,
        help="Groups to test vs reference (default: all groups in condition_key except --ref)",
    )
    p.add_argument(
        "--edger_methods",
        nargs="+",
        default=["LRT"],
        choices=sorted(VALID_METHODS),
        help="edgeR method(s) to run (default: LRT)",
    )
    p.add_argument("--min_cells", type=int, default=15, help="Minimum cells per donor")
    p.add_argument(
        "--replicates_per_patient",
        type=int,
        default=1,
        help="Number of pseudoreplicates per donor",
    )
    p.add_argument(
        "--overwrite",
        type=_parse_bool_flag,
        default=True,
        help="Overwrite existing result files (default: True)",
    )
    # parse_args(None) falls back to sys.argv, so one call covers both cases.
    return p.parse_args(test_args)

In [2]:
# Simulate CLI arguments inside Jupyter (foxf2 cohort, genotype contrasts vs. WT).
# NOTE: the following cell assigns `args` again, so when both cells are run in
# order this configuration is overridden.
args = get_args([
    "foxf2",
    "Cellpose_1_nuclei_model",
    "--sample_key", "sample",
    "--subset_key", "cell_type",
    "--condition_key", "genotype",
    "--ref", "WT",
    "--edger_methods", "LRT",
    "--overwrite", "False"
])

In [3]:
# Simulate CLI arguments inside Jupyter (aging cohort, conditions compared against WT_3).
# "--overwrite False" makes the export step skip result files that already exist.
args = get_args([
    "aging",
    "Cellpose_1_nuclei_model",
    "--sample_key", "sample",
    "--subset_key", "cell_type",
    "--condition_key", "condition",
    "--ref", "WT_3",
    "--edger_methods", "LRT",
    "--overwrite", "False"
])

In [4]:
args

Namespace(cohort='aging', seg_method='Cellpose_1_nuclei_model', sample_key='sample', subset_key='cell_type', subset_values=None, condition_key='condition', batch_key='slide', ref='WT_3', test_groups=None, edger_methods=['LRT'], min_cells=15, replicates_per_patient=1, overwrite=False)

In [5]:
# Logger setup: a dedicated "dea" logger, plus routing of R console output
# through it so that R messages appear in the same log stream.
logger = logging.getLogger("dea")
if len(logger.handlers) == 0:
    # Configure only once so re-running the cell does not duplicate handlers.
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s")
    )
    logger.addHandler(stream_handler)
    logger.setLevel(logging.INFO)
    logger.propagate = False


def _r_print(text):
    """Log regular R console output at DEBUG level."""
    logger.debug(f"R: {text.strip()}")


def _r_error(text):
    """Raise the R error text as an RRuntimeError."""
    raise RRuntimeError(text.strip())


def _r_message(text):
    """Log R message() output at INFO level."""
    logger.info(f"R: {text.strip()}")


def _r_warn(text):
    """Log text starting with 'warning' as WARNING, everything else as INFO."""
    if text.lstrip().lower().startswith("warning"):
        emit = logger.warning
    else:
        emit = logger.info
    emit(f"R: {text.strip()}")


# rpy2's callback logger shares the handler list of the "dea" logger.
rcb.logger.handlers = logger.handlers
rcb.consolewrite_print = _r_print
rcb.consolewrite_error = _r_error
rcb.consolewrite_message = _r_message
rcb.consolewrite_warn = _r_warn
rcb.consolewrite_warnerror = rcb.consolewrite_warn

In [6]:
# R setup
# Converter combining the rpy2 defaults with the pandas and anndata2ri converters.
conv = ro.default_converter + ro.pandas2ri.converter + anndata2ri.converter
# NOTE: sys.path[1] is the repository path added with sys.path.insert(1, ...) in
# the first cell; this lookup relies on that entry still being at index 1.
r_script = Path(sys.path[1]) / "cellseg_benchmark" / "dea_utils.r"
# Source the R helper script and fetch `edgeR_loop` from R's global environment.
ro.r["source"](str(r_script))
edgeR_loop = ro.globalenv["edgeR_loop"]

In [7]:
# Input/output locations, derived from the cohort and segmentation method in `args`.
base_path = Path("/dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark")
method_path = base_path / "analysis" / args.cohort / args.seg_method
# DEA results are written to a "dea" subfolder, created if missing.
output_dir = method_path / "dea"
output_dir.mkdir(parents=True, exist_ok=True)

In [9]:
# Load the integrated AnnData file for the selected cohort / segmentation method.
logger.info("Loading integrated AnnData...")
adata = sc.read_h5ad(method_path / "adatas" / "adata_integrated.h5ad.gz")
# Overwrite obs["cell_type"] with the "cell_type_mmc_raw_revised" column.
# NOTE: the literal column name "cell_type" is used here, not args.subset_key.
adata.obs["cell_type"] = adata.obs["cell_type_mmc_raw_revised"]

2025-09-30 10:38:43,174 [INFO]: Loading integrated AnnData...


In [10]:
# Clean up group names for R conversion: spaces, dashes and slashes become
# underscores; plus signs and parentheses are removed. Missing labels pass through.
_label_cleanup = str.maketrans(
    {" ": "_", "-": "_", "/": "_", "+": None, "(": None, ")": None}
)
adata.obs[args.subset_key] = [
    key if pd.isna(key) else key.translate(_label_cleanup)
    for key in adata.obs[args.subset_key]
]
for col in [args.condition_key, args.sample_key, args.subset_key]:
    adata.obs[col] = adata.obs[col].astype("category")

# Infer the test groups from the data when none were given on the command line.
if args.test_groups is None:
    groups = adata.obs[args.condition_key].dropna().unique().tolist()
    args.test_groups = [g for g in groups if g != args.ref]
    msg = (f"Test groups inferred from '{args.condition_key}': {', '.join(map(str, args.test_groups))}"
           if args.test_groups else
           f"No test groups found different from ref '{args.ref}' under '{args.condition_key}'.")
    logger.info(msg)

# Decide which subsets to process.
if args.subset_values is None:
    # Drop missing labels first: sorting NaN together with strings raises TypeError.
    groups_to_process = sorted(adata.obs[args.subset_key].dropna().unique())
else:
    # Apply the same cleanup as above so user-supplied values match the obs labels.
    groups_to_process = [v.translate(_label_cleanup) for v in args.subset_values]

# print sample overview
for cond, df in adata.obs.groupby(args.condition_key, observed=True):
    samples = df[args.sample_key].unique()
    # map(str, ...) keeps the join working when sample IDs are not strings.
    logger.info(f"{cond}: {len(samples)} samples → {', '.join(map(str, samples))}")

2025-09-30 10:43:19,525 [INFO]: Test groups inferred from 'condition': WT_18, WT_24, WT_6, WT_12
2025-09-30 10:43:19,616 [INFO]: WT_3: 3 samples → aging_s8_r1, aging_s10_r1, aging_s10_r2
2025-09-30 10:43:19,620 [INFO]: WT_6: 3 samples → aging_s5_r1, aging_s7_r2, aging_s1_r0
2025-09-30 10:43:19,622 [INFO]: WT_12: 3 samples → aging_s12_r0, aging_s8_r0, aging_s5_r2
2025-09-30 10:43:19,624 [INFO]: WT_18: 3 samples → aging_s11_r0, aging_s6_r0, aging_s8_r2
2025-09-30 10:43:19,626 [INFO]: WT_24: 3 samples → aging_s10_r0, aging_s11_r1, aging_s11_r2


In [11]:
# Per (subset, sample) summaries, broadcast back onto every cell via transform().
per_group = adata.obs.groupby([args.subset_key, args.sample_key], observed=True)
group_volume = per_group['volume_final']
cells_per_group = per_group[args.sample_key].transform('count')
mean_volume = group_volume.transform('mean')
total_volume = group_volume.transform('sum')

adata.obs['n_cells_sum'] = cells_per_group
adata.obs['volume_mean'] = mean_volume
adata.obs['volume_sum'] = total_volume

In [12]:
# Use a copy of the "counts" layer as the working matrix.
adata.X = adata.layers['counts'].copy()
# Guard: the matrix dtype must be an integer type.
assert np.issubdtype(adata.X.dtype, np.integer)

In [13]:
# obs columns carried over into the pseudobulk objects.
obs_to_keep = [args.condition_key, args.subset_key, args.sample_key]
obs_to_keep += ["n_cells_sum", "volume_mean", "volume_sum"]

# The batch covariate is optional: it is used only when a key is set and the
# column exists; otherwise it is disabled for the rest of the run.
batch_available = bool(args.batch_key) and args.batch_key in adata.obs.columns
if not batch_available:
    logger.warning(
        f"Batch column '{args.batch_key}' was not found in adata.obs "
        "or is invalid. Continuing **without** a batch covariate."
    )
    args.batch_key = None
else:
    obs_to_keep.append(args.batch_key)

In [14]:
logger.info("Run pseudobulking...")
if not groups_to_process:
    raise ValueError(f"No values of '{args.subset_key}' to process.")

# Keyword arguments shared by every pseudobulk call.
pb_kwargs = dict(
    sample_key=args.sample_key,
    subset_key=args.subset_key,
    obs_to_keep=obs_to_keep,
    min_cells=args.min_cells,
    replicates_per_patient=args.replicates_per_patient,
    logger=logger,
)

# Pseudobulk each subset, then concatenate once (instead of re-concatenating
# the growing object inside the loop).
total = len(groups_to_process)
pb_parts = []
for i, group in enumerate(groups_to_process, 1):
    logger.info(f"Processing {group} ({i}/{total})")
    pb_parts.append(
        pseudobulk_aggregate_and_filter(adata, subset_value=group, **pb_kwargs)
    )
adata_pb = ad.concat(pb_parts, join="outer", label=None, index_unique=None)
adata_pb.obs_names_make_unique()

# Add whole brain: pseudobulk per sample across all groups
adata_pb_i = pseudobulk_aggregate_and_filter(adata, subset_value=None, **pb_kwargs)
adata_pb_i.obs[args.subset_key] = "all"
# NOTE: this concat uses anndata's default join, unlike the per-subset concat above.
adata_pb = ad.concat([adata_pb, adata_pb_i])
# Free the full single-cell object; re-running earlier cells requires reloading it.
del adata

2025-09-30 10:43:22,115 [INFO]: Run pseudobulking...
2025-09-30 10:43:22,119 [INFO]: Processing Astrocytes (1/21)
2025-09-30 10:43:22,729 [INFO]: 	Processing donor 1/15...
2025-09-30 10:43:24,822 [INFO]: 	Processing donor 15/15...
2025-09-30 10:43:24,957 [INFO]: Processing Astroependymal (2/21)
2025-09-30 10:43:25,877 [INFO]: 	Processing donor 1/15...
2025-09-30 10:43:28,261 [INFO]: 	Processing donor 15/15...
2025-09-30 10:43:28,416 [INFO]: Processing BAMs (3/21)
2025-09-30 10:43:28,598 [INFO]: 	Processing donor 1/15...
2025-09-30 10:43:30,436 [INFO]: 	Processing donor 15/15...
2025-09-30 10:43:30,563 [INFO]: Processing Choroid_Plexus (4/21)
2025-09-30 10:43:30,770 [INFO]: 	Processing donor 1/15...
2025-09-30 10:43:32,651 [INFO]: 	Processing donor 15/15...
2025-09-30 10:43:32,777 [INFO]: Processing ECs (5/21)
2025-09-30 10:43:33,767 [INFO]: 	Processing donor 1/15...
2025-09-30 10:43:36,241 [INFO]: 	Processing donor 15/15...
2025-09-30 10:43:36,399 [INFO]: Processing Ependymal (6/21)
20

In [15]:
# PCA overview plot of the pseudobulk samples (helper imported from cellseg_benchmark.adata_utils).
plot_pseudobulk_pca(adata_pb, args, output_dir, cell_type_colors, logger)

2025-09-30 10:44:39,862 [INFO]: Skipping PCA plot, file already exists: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/pseudobulk_pca.png


In [16]:
logger.info("Run DEA...")
adata_pb = prepare_adata_for_rpy2(adata_pb, key=args.subset_key)

# One independent pseudobulk AnnData copy per unique subset value.
subset_labels = adata_pb.obs[args.subset_key]
unique_groups = subset_labels.unique().tolist()
adatas_pb = {
    label: adata_pb[subset_labels == label].copy()
    for label in unique_groups
}

2025-09-30 10:44:39,901 [INFO]: Run DEA...


In [17]:
adatas_pb

{'Astrocytes': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Astroependymal': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'BAMs': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Choroid_Plexus': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'ECs': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Ependymal': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Immune_Other': AnnData object with n_obs × n

In [20]:
adatas_pb["ECs"].X.todense()

matrix([[1644., 7431., 3396., ..., 3761., 5571., 1580.],
        [ 524., 3569., 1275., ..., 1472., 2684., 1085.],
        [ 671., 3929.,  967., ..., 1447., 1938.,  534.],
        ...,
        [ 484., 2351., 1295., ...,  864., 2820., 1084.],
        [ 574., 3089., 1070., ..., 1147., 3052., 1370.],
        [  96., 1211.,  541., ...,  309., 1141.,  909.]],
       shape=(15, 500), dtype=float32)

In [16]:
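# Exploratory section: step through the edgeR workflow in R for a single subset.
# Skip these cells (up to the "# end" marker) when testing the script!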

In [17]:
%load_ext rpy2.ipython

In [18]:
# Inputs for the exploratory %%R cells below, for a single subset ("ECs").
# NOTE(review): the outputs recorded below show genotype/foxf2 values, presumably
# from a run with the foxf2 arguments; confirm before relying on them.
group_i = "ECs"
# `adata` is rebound to the pseudobulk object of this subset (the full object was
# deleted after pseudobulking); the %%R cells import it under this name via `-i adata`.
adata = adatas_pb[group_i]
condition_col = args.condition_key
test_groups = args.test_groups
ref_group = args.ref
batch_col = args.batch_key
edger_methods = args.edger_methods

In [19]:
edger_methods

['LRT']

In [20]:
batch_col

'slide'

In [21]:
condition_col

'genotype'

In [22]:
test_groups

['GLKO', 'ECKO', 'PCKO']

In [23]:
adata.obs

Unnamed: 0,genotype,cell_type,slide,n_cells_sum,volume_mean,volume_sum,sample
donor_foxf2_s1_r0_0-4,ECKO,ECs,1,3770,1432.30689,5399797.0,foxf2_s1_r0
donor_foxf2_s1_r1_0-4,WT,ECs,1,3244,1441.239386,4675381.0,foxf2_s1_r1
donor_foxf2_s2_r1_0-4,WT,ECs,2,3709,1321.529465,4901553.0,foxf2_s2_r1
donor_foxf2_s2_r2_0-4,PCKO,ECs,2,3730,1422.418953,5305623.0,foxf2_s2_r2
donor_foxf2_s4_r0_0-3,PCKO,ECs,4,4136,1372.144486,5675190.0,foxf2_s4_r0
donor_foxf2_s4_r1_0-4,ECKO,ECs,4,2965,1478.098312,4382561.0,foxf2_s4_r1
donor_foxf2_s5_r0_0-4,PCKO,ECs,5,4889,1326.376653,6484655.0,foxf2_s5_r0
donor_foxf2_s5_r1_0-4,ECKO,ECs,5,3511,1172.656272,4117196.0,foxf2_s5_r1
donor_foxf2_s6_r0_0-4,GLKO,ECs,6,2986,1495.124579,4464442.0,foxf2_s6_r0
donor_foxf2_s6_r1_0-4,GLKO,ECs,6,3486,1512.162401,5271398.0,foxf2_s6_r1


In [24]:
%%R -c conv -i adata -i condition_col -i test_groups -i group_i -i ref_group -i batch_col -i edger_methods -o design

# Build the design matrix: condition (reference level first) plus an optional batch term.
condition <- droplevels(factor(colData(adata)[[condition_col]]))
condition <- stats::relevel(condition, ref = ref_group)

# use batch covariate if exists and has >1 level
use_batch <- !is.null(batch_col) && batch_col %in% colnames(colData(adata)) &&
             nlevels(droplevels(factor(colData(adata)[[batch_col]]))) > 1

design <- if (use_batch) {
  batch <- droplevels(factor(colData(adata)[[batch_col]]))
  model.matrix(~ condition + batch)
} else {
  model.matrix(~ condition)
}

message("  Design: ~ ", condition_col, if (use_batch) paste0(" + ", batch_col) else "")

# Residual degrees of freedom: number of samples minus the rank of the design.
rdof <- nrow(design) - qr(design)$rank
if (rdof <= 0) {
  # return() is only valid inside a function; at the top level of a cell the
  # run is aborted explicitly with a clear message instead.
  stop("no residual df (likely too few samples or condition confounded with batch)")
}

  Design: ~ genotype + slide


In [25]:
%%R
# Sanity check: correlation between per-sample total counts and summed volume.
sample_meta <- colData(adata)
vol_sum <- sample_meta[["volume_sum"]]
libsize <- colSums(SummarizedExperiment::assay(adata, "X"))
cor(libsize, vol_sum)

2025-09-03 16:15:38,411 [INFO]: R: [1]
2025-09-03 16:15:38,412 [INFO]: R: 0.3577262
2025-09-03 16:15:38,413 [INFO]: R: 


In [26]:
%%R -o s
# Filter lowly expressed genes, then normalise using summed volume as library size.
min_count <- 2
y <- edgeR::DGEList(assay(adata, "X"), group = condition)
keep <- edgeR::filterByExpr(y, min.count = min_count, design = design)
y <- y[keep, , keep.lib.sizes = FALSE]

# replace library sizes with volume
vol <- colData(adata)[["volume_sum"]]
# anyNA() is checked first: any(vol <= 0) yields NA for missing volumes, which
# would make if() fail with an unrelated error.
if (is.null(vol) || anyNA(vol) || any(vol <= 0)) {
  stop("Invalid volume in colData(adata), expects positive, non-missing 'volume_sum'.")
}
y$samples$lib.size <- vol
y <- edgeR::calcNormFactors(y)

# Per-sample table (group, lib.size, norm.factors) exported to Python as `s`.
s <- y$samples

In [27]:
s

Unnamed: 0,group,lib.size,norm.factors
donor_foxf2_s1_r0_0-4,ECKO,5399797.0,1.326654
donor_foxf2_s1_r1_0-4,WT,4675381.0,1.33367
donor_foxf2_s2_r1_0-4,WT,4901553.0,0.871858
donor_foxf2_s2_r2_0-4,PCKO,5305623.0,0.914129
donor_foxf2_s4_r0_0-3,PCKO,5675190.0,1.290681
donor_foxf2_s4_r1_0-4,ECKO,4382561.0,1.403576
donor_foxf2_s5_r0_0-4,PCKO,6484655.0,0.78573
donor_foxf2_s5_r1_0-4,ECKO,4117196.0,0.59662
donor_foxf2_s6_r0_0-4,GLKO,4464442.0,0.975105
donor_foxf2_s6_r1_0-4,GLKO,5271398.0,0.851289


In [28]:
design

array([[1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 1., 1., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 1., 0., 0., 0.],
       [1., 1., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0., 0., 1., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 1., 0., 0., 0., 0., 0., 1.]])

In [29]:
%%R
# Check that the reference and at least one test group are present for this subset.
condition <- droplevels(factor(colData(adata)[[condition_col]]))

# return() is only valid inside a function; at the top level of a cell the run
# is aborted explicitly with stop() instead.
if (is.null(condition) || nlevels(condition) < 2) {
  stop(group_i, ": skip (condition invalid or <2 levels)")
}

# Test groups that actually occur in this subset.
present <- intersect(test_groups, levels(condition))

if (!(ref_group %in% levels(condition)) || length(present) == 0) {
  stop(group_i, ": skip (ref_group/test_groups missing)")
}

missing <- setdiff(test_groups, present)

message(
  group_i, ": ", paste(present, collapse = "/"), " vs ", ref_group,
  if (length(missing)) paste0(" (missing:", paste(missing, collapse = ","), ")") else ""
)

ECs: GLKO/ECKO/PCKO vs WT


In [30]:
%%R
# Test one group against the reference on a fitted edgeR model.
#
# fit          fitted model object passed on to glmQLFTest / glmLRT
# design       design matrix; its column names decide how the test is formed
# edger_method "QL" selects glmQLFTest, anything else glmLRT
# test_group   group to test
# ref_group    reference group
#
# Returns the full topTags table with an added `test` column
# ("<test_group>vs<ref_group>"); stops if the needed design columns are missing.
edgeR_run_test <- function(fit, design, edger_method, test_group, ref_group){

  design_cols <- colnames(design)
  c_test <- paste0("condition", make.names(test_group))
  c_ref  <- paste0("condition", make.names(ref_group))
  has_test <- c_test %in% design_cols
  has_ref  <- c_ref %in% design_cols

  # Dispatch to the test matching the requested edgeR method.
  run_glm_test <- function(...) {
    if (edger_method == "QL") edgeR::glmQLFTest(fit, ...) else edgeR::glmLRT(fit, ...)
  }

  if (has_test && !has_ref) {
    # intercept design: the reference is absorbed in the intercept
    tt <- run_glm_test(coef = c_test)
  } else if (has_test && has_ref) {
    # no-intercept design: explicit test-minus-reference contrast
    con <- limma::makeContrasts(contrasts = sprintf("%s-%s", c_test, c_ref), levels = design)
    tt  <- run_glm_test(contrast = con)
  } else {
    stop("Cannot form contrast. Needed ", c_test,
         if (has_ref) "" else paste0(" and ", c_ref),
         ". Available columns: ", paste(design_cols, collapse=", "))
  }

  de <- edgeR::topTags(tt, n = Inf)$table
  de$test <- paste0(test_group, "vs", ref_group)
  de
}


In [41]:
%%R
# Fit one model per edgeR method and test every present group against the reference.
# Results are collected in `res`, keyed "<test_group>_<method>".
res <- list()
for (method_i in edger_methods) {
  model_i <- edgeR_fit_model(adata, edger_method=method_i, condition_col=condition_col, ref_group=ref_group, batch_col=batch_col)
  if (!is.null(model_i)) {
    for (group_tg in present) {
      de_tab <- edgeR_run_test(model_i$fit, model_i$design, method_i, group_tg, ref_group)
      if (!is.null(de_tab)) {
        res[[paste(group_tg, method_i, sep="_")]] <- dplyr::mutate(
          as.data.frame(de_tab),
          FC = 2^logFC, edger_method = method_i, test_group = group_tg, ref = ref_group
        )
      }
    }
  }
}

  Design: ~ genotype + slide


In [42]:
#%%R
#head(res)

In [43]:
%%R -o out
# Combine the per-test tables into one data frame exported to Python as `out`.
# return() is only valid inside a function; at the top level of a cell an empty
# result list is reported explicitly with stop() instead.
if (!length(res)) stop("No DEA results available to combine ('res' is empty).")
out <- dplyr::bind_rows(res, .id="result_id")
# Strip the "...<n>" suffix from the row names to recover the gene names.
out$gene <- sub("[.][.][.].*", "", rownames(out))
rownames(out) <- NULL
out <- dplyr::select(out, gene, FC, dplyr::everything())

In [44]:
out

Unnamed: 0,gene,FC,result_id,logFC,logCPM,LR,PValue,FDR,test,edger_method,test_group,ref
1,Foxf2,0.135548,GLKO_LRT,-2.883127,9.629642,142.983221,5.927883e-33,2.963942e-30,GLKOvsWT,LRT,GLKO,WT
2,Slc16a1,0.575650,GLKO_LRT,-0.796735,10.559661,23.870296,1.030496e-06,2.576241e-04,GLKOvsWT,LRT,GLKO,WT
3,Edn1,2.231627,GLKO_LRT,1.158096,9.309495,19.775418,8.709567e-06,1.451595e-03,GLKOvsWT,LRT,GLKO,WT
4,Depp1,1.715268,GLKO_LRT,0.778434,8.380976,16.067008,6.114000e-05,7.515391e-03,GLKOvsWT,LRT,GLKO,WT
5,Atf3,0.411034,GLKO_LRT,-1.282671,7.430543,15.676455,7.515391e-05,7.515391e-03,GLKOvsWT,LRT,GLKO,WT
...,...,...,...,...,...,...,...,...,...,...,...,...
1496,Pdgfra,0.999148,PCKO_LRT,-0.001230,7.776596,0.000019,9.965334e-01,9.987309e-01,PCKOvsWT,LRT,PCKO,WT
1497,Itgax,0.999406,PCKO_LRT,-0.000857,4.959603,0.000010,9.974660e-01,9.987309e-01,PCKOvsWT,LRT,PCKO,WT
1498,Gad2,1.000511,PCKO_LRT,0.000737,9.161317,0.000009,9.976125e-01,9.987309e-01,PCKOvsWT,LRT,PCKO,WT
1499,Itga2b,0.999647,PCKO_LRT,-0.000509,6.896683,0.000003,9.986894e-01,9.987309e-01,PCKOvsWT,LRT,PCKO,WT


In [None]:
# end

In [16]:
# Run the R-side edgeR workflow once per subset; subsets for which R returns
# NULL are left out of `all_degs`.
all_degs = {}
for group_i, adata_tmp in adatas_pb.items():
    with localconverter(conv):
        combined_results = edgeR_loop(
            adata=adata_tmp,
            group_i=group_i,
            edger_methods=args.edger_methods,
            test_groups=args.test_groups,
            ref_group=args.ref,
            condition_col=args.condition_key,
            batch_col=args.batch_key,
        )
    if combined_results is ro.NULL:
        continue
    all_degs[group_i] = combined_results

2025-09-19 15:19:18,842 [INFO]: R: Astrocytes: WT_18/WT_24/WT_6/WT_12 vs WT_3
2025-09-19 15:19:18,894 [INFO]: R: Design: ~ condition + slide
2025-09-19 15:19:21,730 [INFO]: R: Astroependymal: WT_18/WT_24/WT_6/WT_12 vs WT_3
2025-09-19 15:19:21,736 [INFO]: R: Design: ~ condition + slide
2025-09-19 15:19:24,430 [INFO]: R: BAMs: WT_18/WT_24/WT_6/WT_12 vs WT_3
2025-09-19 15:19:24,436 [INFO]: R: Design: ~ condition + slide
2025-09-19 15:19:28,558 [INFO]: R: Choroid_Plexus: WT_18/WT_24/WT_6/WT_12 vs WT_3
2025-09-19 15:19:28,563 [INFO]: R: Design: ~ condition + slide
2025-09-19 15:19:31,296 [INFO]: R: ECs: WT_18/WT_24/WT_6/WT_12 vs WT_3
2025-09-19 15:19:31,302 [INFO]: R: Design: ~ condition + slide
2025-09-19 15:19:34,654 [INFO]: R: Ependymal: WT_18/WT_24/WT_6/WT_12 vs WT_3
2025-09-19 15:19:34,659 [INFO]: R: Design: ~ condition + slide
2025-09-19 15:19:38,205 [INFO]: R: Immune_Other: skip (condition invalid or <2 levels)
2025-09-19 15:19:38,312 [INFO]: R: Microglia: WT_18/WT_24/WT_6/WT_12 vs W

In [17]:
logger.info("Format output table...")
if not all_degs:
    # pd.concat on an empty list fails with an uninformative message.
    raise ValueError("No DEA results to format: edgeR returned nothing for every subset.")

# Stack the per-subset result tables, tagging each row with its subset.
collapsed_df = (
    pd.concat(
        [pd.DataFrame(v).assign(subset=k) for k, v in all_degs.items()],
        ignore_index=True,
    )
    .drop(columns=["result_id"], errors="ignore")
)

collapsed_df = add_group_sample_counts(
    collapsed_df, adatas_pb,
    condition_key=args.condition_key,
    sample_key=args.sample_key,
    ref=args.ref,
    test_groups=args.test_groups,
    subset_group="subset"
)
# Keep the gene symbol both as the index and as a regular column placed before "FC".
collapsed_df = collapsed_df.set_index("gene")
collapsed_df.insert(collapsed_df.columns.get_loc("FC"), "gene", collapsed_df.index)

# Preferred leading column order. Columns missing from the results are skipped
# rather than raising KeyError (e.g. "LR" is absent when only the QL method is run).
preferred = ["subset","gene","FC", "logFC", "PValue", "FDR", "logCPM", "LR", "edgeR_method", "test_group", "ref", "test"]
order = [c for c in preferred if c in collapsed_df.columns]
collapsed_df = collapsed_df[order + [c for c in collapsed_df.columns if c not in order]]

logger.info("Add ensembl gene ids...")
collapsed_df = add_ensembl_id(collapsed_df, logger=logger)


2025-09-19 15:20:16,266 [INFO]: Format output table...
2025-09-19 15:20:16,366 [INFO]: Add ensembl gene ids...
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
3 input query terms found no hit:	['Ctps', 'Pifo', 'H2afj']
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
2025-09-19 15:20:20,141 [INFO]: 500 unique symbols -> 500 mapped (100.0%)
2025-09-19 15:20:20,142 [INFO]: Renamed aliases: Ctps->Ctps1, Pifo->Cimap3, H2afj->H2aj


In [18]:
collapsed_df

Unnamed: 0_level_0,subset,gene,FC,logFC,PValue,FDR,logCPM,LR,edgeR_method,test_group,ref,test,test_group_n,ref_n,ensembl_id
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
C4b,Astrocytes,C4b,4.263428,2.092014,2.971442e-10,1.485721e-07,8.905858,3.969329e+01,LRT,WT_18,WT_3,WT_18vsWT_3,3,3,ENSMUSG00000073418
Il18,Astrocytes,Il18,0.554940,-0.849596,1.628177e-03,4.070443e-01,8.620698,9.927611e+00,LRT,WT_18,WT_3,WT_18vsWT_3,3,3,ENSMUSG00000039217
Foxq1,Astrocytes,Foxq1,1.862377,0.897145,5.288532e-03,7.274869e-01,3.564496,7.778034e+00,LRT,WT_18,WT_3,WT_18vsWT_3,3,3,ENSMUSG00000038415
Ereg,Astrocytes,Ereg,2.214477,1.146966,5.819895e-03,7.274869e-01,2.737630,7.605239e+00,LRT,WT_18,WT_3,WT_18vsWT_3,3,3,ENSMUSG00000029377
Cd74,Astrocytes,Cd74,1.713310,0.776786,1.397695e-02,9.939132e-01,4.231975,6.041052e+00,LRT,WT_18,WT_3,WT_18vsWT_3,3,3,ENSMUSG00000024610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pdgfra,all,Pdgfra,0.998450,-0.002238,9.907111e-01,9.982080e-01,11.925735,1.355406e-04,LRT,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000029231
Slc2a1,all,Slc2a1,0.998878,-0.001620,9.930154e-01,9.982080e-01,8.717468,7.663205e-05,LRT,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000028645
Wbp2,all,Wbp2,0.998840,-0.001675,9.942152e-01,9.982080e-01,13.595498,5.256594e-05,LRT,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000034341
Gfap,all,Gfap,1.000438,0.000631,9.980882e-01,9.993034e-01,13.759149,5.741123e-06,LRT,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000020932


In [19]:
logger.info("Export table(s)...")
subset_key_clean = re.sub(r'[_-]', '', args.subset_key)
name = f"{args.cohort}-by-{subset_key_clean}_pseudobulk_edgeR"

# One workbook per (edgeR method, test) pair, with one sheet per subset.
for method_and_test, df_mt in collapsed_df.groupby(["edgeR_method", "test"]):
    edgeR_method, test = method_and_test
    xlsx = output_dir / f"{name}_{edgeR_method}_{test}.xlsx"
    if not args.overwrite and xlsx.exists():
        logger.info(f"  Exists, skip: {xlsx}")
        continue

    has_values = df_mt.notna().any()
    df_mt = df_mt.loc[:, has_values]  # drop all-NaN cols

    used = set()  # sheet names already taken within this workbook
    with pd.ExcelWriter(xlsx, engine="xlsxwriter") as writer:
        for gid, g in df_mt.groupby("subset", sort=True):
            sheet = safe_sheet(gid, used)
            ranked = g.sort_values("PValue")
            ranked.to_excel(writer, sheet_name=sheet, index=False)
    logger.info(f"  Wrote: {xlsx}")

logger.info("Done.")

2025-09-19 15:20:20,179 [INFO]: Export table(s)...
2025-09-19 15:20:20,214 [INFO]:   Exists, skip: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR_LRT_WT_12vsWT_3.xlsx
2025-09-19 15:20:20,223 [INFO]:   Exists, skip: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR_LRT_WT_18vsWT_3.xlsx
2025-09-19 15:20:20,231 [INFO]:   Exists, skip: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR_LRT_WT_24vsWT_3.xlsx
2025-09-19 15:20:20,239 [INFO]:   Exists, skip: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR_LRT_WT_6vsWT_3.xlsx
2025-09-19 15:20:20,241 [INFO]: Done.
