In [1]:
# ruff: noqa
import argparse
import logging
import re
import sys
import warnings
from pathlib import Path

import anndata as ad
import anndata2ri
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro
import scanpy as sc
from rpy2.rinterface_lib.embedded import RRuntimeError
from rpy2.robjects.conversion import localconverter

plt.rcParams["font.family"] = (
    "Arial" if "Arial" in [f.name for f in fm.fontManager.ttflist] else "sans-serif"
)
plt.rcParams["font.weight"] = "normal"

sys.path.insert(1, "/dss/dsshome1/0C/ra98gaq/Git/cellseg-benchmark")
from cellseg_benchmark._constants import cell_type_colors
from cellseg_benchmark.adata_utils import plot_pseudobulk_pca
from cellseg_benchmark.dea_utils import (
    add_ensembl_id,
    add_group_sample_counts,
    prepare_adata_for_rpy2,
    pseudobulk_aggregate_and_filter,
    safe_sheet,
)

warnings.filterwarnings("ignore", message=".*Observation names are not unique*")
VALID_METHODS = {"LRT", "QL"}


def get_args(test_args=None):  # noqa: D103
    p = argparse.ArgumentParser(description="Run edgeR pseudobulk-based DEA")
    p.add_argument("cohort", help="Cohort name, e.g. foxf2")
    p.add_argument(
        "seg_method", help="Segmentation method, e.g. Cellpose_1_nuclei_model"
    )
    p.add_argument(
        "--sample_key", default="sample", help="obs column for donor/sample ID"
    )
    p.add_argument(
        "--subset_key",
        default="cell_type",
        help="obs column used to subset data (e.g. cell_type, cluster, region)",
    )
    p.add_argument(
        "--subset_values",
        nargs="+",
        default=None,
        help="Values of subset_key to process (default: all unique values)",
    )
    p.add_argument(
        "--condition_key",
        default="genotype",
        help="obs column for condition (e.g. genotype)",
    )
    p.add_argument(
        "--batch_key",
        default="slide",
        help="optional batch key for inclusion as covariate (default: slide)",
    )
    p.add_argument("--ref", default="WT", help="Reference group (default: WT)")
    p.add_argument(
        "--test_groups",
        nargs="+",
        default=None,
        help="Groups to test vs reference (default: all groups in condition_key except --ref)",
    )
    p.add_argument(
        "--edger_methods",
        nargs="+",
        default=["LRT"],
        choices=sorted(VALID_METHODS),
        help="edgeR method(s) to run (default: LRT)",
    )
    p.add_argument("--min_cells", type=int, default=15, help="Minimum cells per donor")
    p.add_argument(
        "--replicates_per_patient",
        type=int,
        default=1,
        help="Number of pseudoreplicates per donor",
    )
    p.add_argument(
        "--overwrite",
        type=lambda x: str(x).lower() in ["true", "1", "yes"],
        default=True,
        help="Overwrite existing result files (default: True)",
    )
    if test_args is not None:
        return p.parse_args(test_args)
    else:
        return p.parse_args()

In [2]:
# Simulate CLI arguments inside Jupyter
args = get_args(
    [
        "foxf2",
        "Cellpose_1_nuclei_model",
        "--sample_key",
        "sample",
        "--subset_key",
        "cell_type",
        "--condition_key",
        "genotype",
        "--ref",
        "WT",
        "--edger_methods",
        "LRT",
        "--overwrite",
        "False",
    ]
)

In [3]:
# Simulate CLI arguments inside Jupyter
args = get_args(
    [
        "aging",
        "Cellpose_1_nuclei_model",
        "--sample_key",
        "sample",
        "--subset_key",
        "cell_type",
        "--condition_key",
        "condition",
        "--ref",
        "WT_3",
        "--edger_methods",
        "LRT",
        "--overwrite",
        "False",
    ]
)

In [4]:
args

Namespace(cohort='aging', seg_method='Cellpose_1_nuclei_model', sample_key='sample', subset_key='cell_type', subset_values=None, condition_key='condition', batch_key='slide', ref='WT_3', test_groups=None, edger_methods=['LRT'], min_cells=15, replicates_per_patient=1, overwrite=False)

In [5]:
# Logger setup
logger = logging.getLogger("dea")
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s"))
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    logger.propagate = False
rcb.logger.handlers = logger.handlers
rcb.consolewrite_print = lambda x: logger.debug(f"R: {x.strip()}")
rcb.consolewrite_error = lambda x: (_ for _ in ()).throw(RRuntimeError(x.strip()))
rcb.consolewrite_message = lambda x: logger.info(f"R: {x.strip()}")
rcb.consolewrite_warn = lambda x: (
    logger.warning if x.lstrip().lower().startswith("warning") else logger.info
)(f"R: {x.strip()}")
setattr(rcb, "consolewrite_warnerror", rcb.consolewrite_warn)

In [6]:
# R setup
conv = ro.default_converter + ro.pandas2ri.converter + anndata2ri.converter
r_script = Path(sys.path[1]) / "cellseg_benchmark" / "dea_utils.r"
ro.r["source"](str(r_script))
edgeR_loop = ro.globalenv["edgeR_loop"]

In [7]:
base_path = Path("/dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark")
method_path = base_path / "analysis" / args.cohort / args.seg_method
output_dir = method_path / "dea"
output_dir.mkdir(parents=True, exist_ok=True)

In [8]:
logger.info("Loading integrated AnnData...")
adata = sc.read_h5ad(method_path / "adatas" / "adata_integrated.h5ad.gz")
adata.obs["cell_type"] = adata.obs["cell_type_mmc_raw_revised"]

2025-10-23 00:02:11,808 [INFO]: Loading integrated AnnData...


In [9]:
# re-group cell types
adata.obs["cell_type"] = (
    adata.obs["cell_type"]
    .astype(str)
    .replace({
        "Tanycytes": "Ependymal",
        "Astroependymal": "Astrocytes",
        "Neurons-Glyc-Gaba":"Neurons-Other"
    })
    .astype("category")
)

In [10]:
adata.obs["cell_type"].value_counts()

cell_type
Oligodendrocytes            138351
Neurons-Glut                117502
Neurons-Dopa                 80936
Astrocytes                   73541
ECs                          50945
Neurons-Granule-Immature     43821
Neurons-Other                38412
Undefined                    32515
Microglia                    26673
OPCs                         21036
Pericytes                    15075
VLMCs                         7260
Ependymal                     6527
SMCs                          6195
Choroid-Plexus                5690
Neurons-Gaba                  3266
BAMs                          2076
Immune-Other                   134
Name: count, dtype: int64

In [11]:
# Clean up group names for R conversion
adata.obs[args.subset_key] = [
    key.replace(" ", "_")
    .replace("-", "_")
    .replace("/", "_")
    .replace("+", "")
    .replace("(", "")
    .replace(")", "")
    if not pd.isna(key)
    else key
    for key in adata.obs[args.subset_key]
]
for col in [args.condition_key, args.sample_key, args.subset_key]:
    adata.obs[col] = adata.obs[col].astype("category")
if args.test_groups is None:
    groups = adata.obs[args.condition_key].dropna().unique().tolist()
    args.test_groups = [g for g in groups if g != args.ref]
    msg = (
        f"Test groups inferred from '{args.condition_key}': {', '.join(map(str, args.test_groups))}"
        if args.test_groups
        else f"No test groups found different from ref '{args.ref}' under '{args.condition_key}'."
    )
    logger.info(msg)

# print sample overview
if args.subset_values is None:
    groups_to_process = sorted(adata.obs[args.subset_key].unique())
else:
    groups_to_process = args.subset_values
for cond, df in adata.obs.groupby(args.condition_key, observed=True):
    samples = df[args.sample_key].unique()
    logger.info(f"{cond}: {len(samples)} samples → {', '.join(samples)}")

2025-10-23 00:02:54,475 [INFO]: Test groups inferred from 'condition': WT_18, WT_24, WT_6, WT_12
2025-10-23 00:02:54,569 [INFO]: WT_3: 3 samples → aging_s8_r1, aging_s10_r1, aging_s10_r2
2025-10-23 00:02:54,571 [INFO]: WT_6: 3 samples → aging_s5_r1, aging_s7_r2, aging_s1_r0
2025-10-23 00:02:54,573 [INFO]: WT_12: 3 samples → aging_s12_r0, aging_s8_r0, aging_s5_r2
2025-10-23 00:02:54,575 [INFO]: WT_18: 3 samples → aging_s11_r0, aging_s6_r0, aging_s8_r2
2025-10-23 00:02:54,576 [INFO]: WT_24: 3 samples → aging_s10_r0, aging_s11_r1, aging_s11_r2


In [12]:
adata.obs["n_cells_sum"] = adata.obs.groupby(
    [args.subset_key, args.sample_key], observed=True
)[args.sample_key].transform("count")
adata.obs["volume_mean"] = adata.obs.groupby(
    [args.subset_key, args.sample_key], observed=True
)["volume_final"].transform("mean")
adata.obs["volume_sum"] = adata.obs.groupby(
    [args.subset_key, args.sample_key], observed=True
)["volume_final"].transform("sum")

In [13]:
adata.X = adata.layers["counts"].copy()
assert np.issubdtype(adata.X.dtype, np.integer)

In [14]:
obs_to_keep = [
    args.condition_key,
    args.subset_key,
    args.sample_key,
    "n_cells_sum",
    "volume_mean",
    "volume_sum",
]
if args.batch_key and args.batch_key in adata.obs.columns:
    obs_to_keep.append(args.batch_key)
else:
    logger.warning(
        f"Batch column '{args.batch_key}' was not found in adata.obs "
        "or is invalid. Continuing **without** a batch covariate."
    )
    args.batch_key = None

In [15]:
logger.info("Run pseudobulking...")
total = len(groups_to_process)
logger.info(f"Processing {groups_to_process[0]} ({1}/{total})")
adata_pb = pseudobulk_aggregate_and_filter(
    adata,
    subset_value=groups_to_process[0],
    sample_key=args.sample_key,
    subset_key=args.subset_key,
    obs_to_keep=obs_to_keep,
    min_cells=args.min_cells,
    replicates_per_patient=args.replicates_per_patient,
    logger=logger,
)
for i, group in enumerate(groups_to_process[1:], 2):
    logger.info(f"Processing {group} ({i}/{total})")
    adata_pb_i = pseudobulk_aggregate_and_filter(
        adata,
        subset_value=group,
        sample_key=args.sample_key,
        subset_key=args.subset_key,
        obs_to_keep=obs_to_keep,
        min_cells=args.min_cells,
        replicates_per_patient=args.replicates_per_patient,
        logger=logger,
    )
    adata_pb = ad.concat(
        [adata_pb, adata_pb_i], join="outer", label=None, index_unique=None
    )
adata_pb.obs_names_make_unique()
# Add whole brain: pseudobulk per sample across all groups
adata_pb_i = pseudobulk_aggregate_and_filter(
    adata,
    subset_value=None,
    sample_key=args.sample_key,
    subset_key=args.subset_key,
    obs_to_keep=obs_to_keep,
    min_cells=args.min_cells,
    replicates_per_patient=args.replicates_per_patient,
    logger=logger,
)
adata_pb_i.obs[args.subset_key] = "all"
adata_pb = ad.concat([adata_pb, adata_pb_i])
del adata

2025-10-23 00:02:55,183 [INFO]: Run pseudobulking...
2025-10-23 00:02:55,185 [INFO]: Processing Astrocytes (1/18)
2025-10-23 00:02:56,419 [INFO]: 	Processing donor 1/15...
2025-10-23 00:02:58,896 [INFO]: 	Processing donor 15/15...
2025-10-23 00:02:59,050 [INFO]: Processing BAMs (2/18)
2025-10-23 00:02:59,230 [INFO]: 	Processing donor 1/15...
2025-10-23 00:03:00,977 [INFO]: 	Processing donor 15/15...
2025-10-23 00:03:01,100 [INFO]: Processing Choroid_Plexus (3/18)
2025-10-23 00:03:01,298 [INFO]: 	Processing donor 1/15...
2025-10-23 00:03:03,077 [INFO]: 	Processing donor 15/15...
2025-10-23 00:03:03,201 [INFO]: Processing ECs (4/18)
2025-10-23 00:03:04,054 [INFO]: 	Processing donor 1/15...
2025-10-23 00:03:06,397 [INFO]: 	Processing donor 15/15...
2025-10-23 00:03:06,557 [INFO]: Processing Ependymal (5/18)
2025-10-23 00:03:06,769 [INFO]: 	Processing donor 1/15...
2025-10-23 00:03:08,555 [INFO]: 	Processing donor 15/15...
2025-10-23 00:03:08,679 [INFO]: Processing Immune_Other (6/18)
2025

In [16]:
plot_pseudobulk_pca(adata_pb, args, output_dir, cell_type_colors, logger)

2025-10-23 00:04:01,892 [INFO]: Skipping PCA plot, file already exists: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/pseudobulk_pca.png


In [17]:
logger.info("Run DEA...")
adata_pb = prepare_adata_for_rpy2(adata_pb, key=args.subset_key)
adatas_pb = {}
unique_groups = adata_pb.obs[args.subset_key].unique().tolist()
for key in unique_groups:
    tmp = adata_pb[adata_pb.obs[args.subset_key] == key].copy()
    adatas_pb[key] = tmp

2025-10-23 00:04:01,922 [INFO]: Run DEA...


In [18]:
adatas_pb

{'Astrocytes': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'BAMs': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Choroid_Plexus': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'ECs': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Ependymal': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Immune_Other': AnnData object with n_obs × n_vars = 1 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Microglia': AnnData object with n_obs × n_vars 

In [19]:
adatas_pb["ECs"].X.todense()

matrix([[1644., 7431., 3396., ..., 3761., 5571., 1580.],
        [ 524., 3569., 1275., ..., 1472., 2684., 1085.],
        [ 671., 3929.,  967., ..., 1447., 1938.,  534.],
        ...,
        [ 484., 2351., 1295., ...,  864., 2820., 1084.],
        [ 574., 3089., 1070., ..., 1147., 3052., 1370.],
        [  96., 1211.,  541., ...,  309., 1141.,  909.]],
       shape=(15, 500), dtype=float32)

In [24]:
# explore sce file in r
# skip when testing script!

In [25]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [26]:
group_i = "ECs"
adata = adatas_pb[group_i]
condition_col = args.condition_key
test_groups = args.test_groups
ref_group = args.ref
batch_col = args.batch_key
edger_methods = args.edger_methods

In [27]:
edger_methods

['LRT']

In [28]:
batch_col

'slide'

In [29]:
condition_col

'condition'

In [30]:
test_groups

['WT_18', 'WT_24', 'WT_6', 'WT_12']

In [31]:
adata.obs

Unnamed: 0,condition,cell_type,n_cells_sum,volume_mean,volume_sum,slide,sample
donor_aging_s1_r0_0-3,WT_6,ECs,4115,1559.523289,6417438.0,1,aging_s1_r0
donor_aging_s5_r1_0-3,WT_6,ECs,2893,1452.749327,4202804.0,5,aging_s5_r1
donor_aging_s5_r2_0-3,WT_12,ECs,3190,1499.69978,4784042.0,5,aging_s5_r2
donor_aging_s6_r0_0-3,WT_18,ECs,3395,1326.12589,4502197.0,6,aging_s6_r0
donor_aging_s7_r2_0-3,WT_6,ECs,3566,1313.579329,4684224.0,7,aging_s7_r2
donor_aging_s8_r0_0-3,WT_12,ECs,3351,1445.541813,4844011.0,8,aging_s8_r0
donor_aging_s8_r1_0-3,WT_3,ECs,3692,1426.136533,5265296.0,8,aging_s8_r1
donor_aging_s8_r2_0-3,WT_18,ECs,3951,1486.435029,5872905.0,8,aging_s8_r2
donor_aging_s10_r0_0-3,WT_24,ECs,3215,1488.239998,4784692.0,10,aging_s10_r0
donor_aging_s10_r1_0-3,WT_3,ECs,3052,1489.293251,4545323.0,10,aging_s10_r1


In [32]:
%%R -c conv -i adata -i condition_col -i test_groups -i group_i -i ref_group -i batch_col -i edger_methods -o design

condition <- droplevels(factor(colData(adata)[[condition_col]]))
condition <- stats::relevel(condition, ref = ref_group)

# use batch covariate if exists and has >1 level
use_batch <- !is.null(batch_col) && batch_col %in% colnames(colData(adata)) &&
             nlevels(droplevels(factor(colData(adata)[[batch_col]]))) > 1

design <- if (use_batch) {
  batch <- droplevels(factor(colData(adata)[[batch_col]]))
  model.matrix(~ condition + batch)
} else {
  model.matrix(~ condition)
}

message("  Design: ~ ", condition_col, if (use_batch) paste0(" + ", batch_col) else "")

rdof <- nrow(design) - qr(design)$rank
if (rdof <= 0) {
  message("  Skip: no residual df (likely too few samples or condition confounded with batch)")
  return(NULL)
}

  Design: ~ condition + slide


In [33]:
%%R
libsize <- colSums(SummarizedExperiment::assay(adata, "X"))
vol_sum <- colData(adata)$volume_sum
cor(libsize, vol_sum)

In [35]:
%%R
min_count = 2
y <- edgeR::DGEList(assay(adata, "X"), group = condition)
keep <- edgeR::filterByExpr(y, min.count = min_count, design = design)
y <- y[keep, , keep.lib.sizes = FALSE]

In [None]:
%%R
# replace library sizes with volume
vol <- colData(adata)[["volume_sum"]]
if (is.null(vol) || any(vol <= 0)) stop("Invalid volume in colData(sce), expects 'volume_sum'.")
y$samples$lib.size <- vol

In [44]:
%%R -o s # noqa: F821
y <- edgeR::calcNormFactors(y, method="TMM")
s <- y$samples

In [27]:
s  # noqa: F821

Unnamed: 0,group,lib.size,norm.factors
donor_foxf2_s1_r0_0-4,ECKO,5399797.0,1.326654
donor_foxf2_s1_r1_0-4,WT,4675381.0,1.33367
donor_foxf2_s2_r1_0-4,WT,4901553.0,0.871858
donor_foxf2_s2_r2_0-4,PCKO,5305623.0,0.914129
donor_foxf2_s4_r0_0-3,PCKO,5675190.0,1.290681
donor_foxf2_s4_r1_0-4,ECKO,4382561.0,1.403576
donor_foxf2_s5_r0_0-4,PCKO,6484655.0,0.78573
donor_foxf2_s5_r1_0-4,ECKO,4117196.0,0.59662
donor_foxf2_s6_r0_0-4,GLKO,4464442.0,0.975105
donor_foxf2_s6_r1_0-4,GLKO,5271398.0,0.851289


In [46]:
design  # noqa: F821

array([[1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.],
       [1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]])

In [47]:
%%R
condition <- droplevels(factor(colData(adata)[[condition_col]]))

if (is.null(condition) || nlevels(condition) < 2) {
  message(group_i, ": skip (condition invalid or <2 levels)")
  return(NULL)
}

present <- intersect(test_groups, levels(condition))

if (!(ref_group %in% levels(condition)) || length(present) == 0) {
  message(group_i, ": skip (ref_group/test_groups missing)")
  return(NULL)
}

missing <- setdiff(test_groups, present)

message(
  group_i, ": ", paste(present, collapse = "/"), " vs ", ref_group,
  if (length(missing)) paste0(" (missing:", paste(missing, collapse = ","), ")") else ""
)

ECs: WT_18/WT_24/WT_6/WT_12 vs WT_3


In [48]:
%%R
edgeR_run_test <- function(fit, design, edger_method, test_group, ref_group){
  
  c_test <- paste0("condition", make.names(test_group))
  c_ref  <- paste0("condition", make.names(ref_group))

  if (c_test %in% colnames(design) && !(c_ref %in% colnames(design))) {
    # intercept design
    tt <- if (edger_method == "QL") edgeR::glmQLFTest(fit, coef = c_test)
          else                      edgeR::glmLRT(fit,  coef = c_test)

  } else if (all(c(c_test, c_ref) %in% colnames(design))) {
    # no-intercept design
    con <- limma::makeContrasts(contrasts = sprintf("%s-%s", c_test, c_ref), levels = design)
    tt  <- if (edger_method == "QL") edgeR::glmQLFTest(fit, contrast = con)
           else                      edgeR::glmLRT(fit,  contrast = con)

  } else {
    stop("Cannot form contrast. Needed ", c_test,
         if (c_ref %in% colnames(design)) "" else paste0(" and ", c_ref),
         ". Available columns: ", paste(colnames(design), collapse=", "))
  }

  de <- edgeR::topTags(tt, n = Inf)$table
  de$test <- paste0(test_group, "vs", ref_group)
  de
}


In [49]:
%%R
res <- list()
for (m in edger_methods) {
  o <- edgeR_fit_model(adata, edger_method=m, condition_col=condition_col, ref_group=ref_group, batch_col=batch_col)
  if (is.null(o)) next
  for (tg in present) {
    tmp <- edgeR_run_test(o$fit, o$design, m, tg, ref_group)
    if (is.null(tmp)) next
    tmp <- dplyr::mutate(as.data.frame(tmp), FC = 2^logFC, edger_method = m, test_group = tg, ref = ref_group)
    res[[paste(tg, m, sep="_")]] <- tmp
  }
}

  Design: ~ condition + slide


In [50]:
# %%R
# head(res)

In [51]:
%%R -o out # noqa: F821
if (!length(res)) return(NULL)
out <- dplyr::bind_rows(res, .id="result_id")
out$gene <- sub("[.][.][.].*", "", rownames(out)); rownames(out) <- NULL
out <- dplyr::select(out, gene, FC, dplyr::everything())

In [52]:
out  # noqa: F821

Unnamed: 0,gene,FC,result_id,logFC,logCPM,LR,PValue,FDR,test,edger_method,test_group,ref
1,C4b,2.940797,WT_18_LRT,1.556207,6.869068,2.887195e+01,7.732463e-08,0.000039,WT_18vsWT_3,LRT,WT_18,WT_3
2,Itm2a,0.515996,WT_18_LRT,-0.954567,9.858652,2.445210e+01,7.618063e-07,0.000190,WT_18vsWT_3,LRT,WT_18,WT_3
3,Car4,0.503968,WT_18_LRT,-0.988595,8.132427,1.861105e+01,1.602887e-05,0.002589,WT_18vsWT_3,LRT,WT_18,WT_3
4,Atf3,2.209323,WT_18_LRT,1.143605,5.519533,1.812253e+01,2.071364e-05,0.002589,WT_18vsWT_3,LRT,WT_18,WT_3
5,Plaur,1.919512,WT_18_LRT,0.940740,4.799563,1.523990e+01,9.468169e-05,0.007975,WT_18vsWT_3,LRT,WT_18,WT_3
...,...,...,...,...,...,...,...,...,...,...,...,...
1996,Ripk1,0.997411,WT_12_LRT,-0.003740,8.208979,2.007565e-04,9.886953e-01,0.996669,WT_12vsWT_3,LRT,WT_12,WT_3
1997,Mlkl,0.997941,WT_12_LRT,-0.002973,4.795492,1.123394e-04,9.915434e-01,0.997529,WT_12vsWT_3,LRT,WT_12,WT_3
1998,Lamc2,0.998509,WT_12_LRT,-0.002152,4.529276,5.820098e-05,9.939130e-01,0.997905,WT_12vsWT_3,LRT,WT_12,WT_3
1999,Fezf2,0.999521,WT_12_LRT,-0.000692,6.472864,4.609824e-06,9.982869e-01,0.999358,WT_12vsWT_3,LRT,WT_12,WT_3


In [None]:
# end

In [20]:
all_degs = {}
for group_i in adatas_pb:
    adata_tmp = adatas_pb[group_i]
    with localconverter(conv):
        combined_results = edgeR_loop(
            adata=adata_tmp,
            group_i=group_i,
            edger_methods=args.edger_methods,
            test_groups=args.test_groups,
            ref_group=args.ref,
            condition_col=args.condition_key,
            batch_col=args.batch_key,
        )
    if combined_results is not ro.NULL:
        all_degs[group_i] = combined_results

2025-10-23 00:04:04,881 [INFO]: R: Astrocytes: WT_18/WT_24/WT_6/WT_12 vs WT_3
2025-10-23 00:04:04,917 [INFO]: R: Design: ~ condition + slide
2025-10-23 00:04:08,810 [INFO]: R: BAMs: WT_18/WT_24/WT_6/WT_12 vs WT_3
2025-10-23 00:04:08,817 [INFO]: R: Design: ~ condition + slide
2025-10-23 00:04:14,774 [INFO]: R: Choroid_Plexus: WT_18/WT_24/WT_6/WT_12 vs WT_3
2025-10-23 00:04:14,782 [INFO]: R: Design: ~ condition + slide
2025-10-23 00:04:19,450 [INFO]: R: ECs: WT_18/WT_24/WT_6/WT_12 vs WT_3
2025-10-23 00:04:19,458 [INFO]: R: Design: ~ condition + slide
2025-10-23 00:04:23,264 [INFO]: R: Ependymal: WT_18/WT_24/WT_6/WT_12 vs WT_3
2025-10-23 00:04:23,271 [INFO]: R: Design: ~ condition + slide
2025-10-23 00:04:27,350 [INFO]: R: Immune_Other: skip (condition invalid or <2 levels)
2025-10-23 00:04:27,465 [INFO]: R: Microglia: WT_18/WT_24/WT_6/WT_12 vs WT_3
2025-10-23 00:04:27,472 [INFO]: R: Design: ~ condition + slide
2025-10-23 00:04:31,983 [INFO]: R: Neurons_Dopa: WT_18/WT_24/WT_6/WT_12 vs WT_

In [21]:
logger.info("Format output table...")
collapsed_df = pd.concat(
    [pd.DataFrame(v).assign(subset=k) for k, v in all_degs.items()], ignore_index=True
).drop(columns=["result_id"], errors="ignore")

collapsed_df = add_group_sample_counts(
    collapsed_df,
    adatas_pb,
    condition_key=args.condition_key,
    sample_key=args.sample_key,
    ref=args.ref,
    test_groups=args.test_groups,
    subset_group="subset",
)
collapsed_df = collapsed_df.set_index("gene")
collapsed_df.insert(collapsed_df.columns.get_loc("FC"), "gene", collapsed_df.index)

order = [
    "subset",
    "gene",
    "FC",
    "logFC",
    "PValue",
    "FDR",
    "logCPM",
    "LR",
    "edgeR_method",
    "test_group",
    "ref",
    "test",
]
collapsed_df = collapsed_df[order + [c for c in collapsed_df.columns if c not in order]]

logger.info("Add ensembl gene ids...")
collapsed_df = add_ensembl_id(collapsed_df, logger=logger)

2025-10-23 00:05:17,837 [INFO]: Format output table...
2025-10-23 00:05:17,952 [INFO]: Add ensembl gene ids...
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
3 input query terms found no hit:	['Ctps', 'H2afj', 'Pifo']
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
2025-10-23 00:05:21,597 [INFO]: 500 unique symbols -> 500 mapped (100.0%)
2025-10-23 00:05:21,598 [INFO]: Renamed aliases: Ctps->Ctps1, H2afj->H2aj, Pifo->Cimap3


In [22]:
collapsed_df

Unnamed: 0_level_0,subset,gene,FC,logFC,PValue,FDR,logCPM,LR,edgeR_method,test_group,ref,test,test_group_n,ref_n,ensembl_id
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
C4b,Astrocytes,C4b,2.330695,1.220760,1.102029e-07,0.000055,12.189814,2.818593e+01,LRT,WT_18,WT_3,WT_18vsWT_3,3,3,ENSMUSG00000073418
Lcn2,Astrocytes,Lcn2,3.877134,1.954991,5.524014e-06,0.001381,6.842015,2.064643e+01,LRT,WT_18,WT_3,WT_18vsWT_3,3,3,ENSMUSG00000026822
Serpina3n,Astrocytes,Serpina3n,1.969149,0.977572,2.182935e-05,0.003638,11.955073,1.802264e+01,LRT,WT_18,WT_3,WT_18vsWT_3,3,3,ENSMUSG00000021091
Cd74,Astrocytes,Cd74,2.017040,1.012240,3.410956e-05,0.004264,6.840595,1.717387e+01,LRT,WT_18,WT_3,WT_18vsWT_3,3,3,ENSMUSG00000024610
Il18,Astrocytes,Il18,0.600681,-0.735330,6.927245e-05,0.006927,10.615429,1.583062e+01,LRT,WT_18,WT_3,WT_18vsWT_3,3,3,ENSMUSG00000039217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sspo,all,Sspo,0.999327,-0.000971,9.960660e-01,0.999576,7.504450,2.431010e-05,LRT,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000029797
Tgfb3,all,Tgfb3,0.999614,-0.000557,9.976724e-01,0.999576,8.562949,8.510056e-06,LRT,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000021253
Pdgfra,all,Pdgfra,0.999785,-0.000310,9.987238e-01,0.999576,10.338063,2.558459e-06,LRT,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000029231
Slc2a1,all,Slc2a1,1.000193,0.000279,9.988010e-01,0.999576,7.129417,2.258308e-06,LRT,WT_12,WT_3,WT_12vsWT_3,3,3,ENSMUSG00000028645


In [23]:
logger.info("Export table(s)...")
subset_key_clean = re.sub(r"[_-]", "", args.subset_key)
name = f"{args.cohort}-by-{subset_key_clean}_pseudobulk_edgeR"

for (edgeR_method, test), df_mt in collapsed_df.groupby(["edgeR_method", "test"]):
    df_mt = df_mt.loc[:, df_mt.notna().any()]  # drop all-NaN cols
    xlsx = output_dir / f"{name}_{edgeR_method}_{test}.xlsx"
    if xlsx.exists() and not args.overwrite:
        logger.info(f"  Exists, skip: {xlsx}")
        continue
    used = set()
    with pd.ExcelWriter(xlsx, engine="xlsxwriter") as writer:
        for gid, g in df_mt.groupby("subset", sort=True):
            g.sort_values("PValue").to_excel(
                writer, sheet_name=safe_sheet(gid, used), index=False
            )
    logger.info(f"  Wrote: {xlsx}")

logger.info("Done.")

2025-10-23 00:05:21,669 [INFO]: Export table(s)...
2025-10-23 00:05:23,403 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR-libnorm_LRT_WT_12vsWT_3.xlsx
2025-10-23 00:05:24,956 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR-libnorm_LRT_WT_18vsWT_3.xlsx
2025-10-23 00:05:26,520 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR-libnorm_LRT_WT_24vsWT_3.xlsx
2025-10-23 00:05:28,214 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR-libnorm_LRT_WT_6vsWT_3.xlsx
2025-10-23 00:05:28,217 [INFO]: Done.
