In [1]:
# ruff: noqa
import argparse
import logging
import re
import sys
import warnings
from pathlib import Path

import anndata as ad
import anndata2ri
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro
import scanpy as sc
from rpy2.rinterface_lib.embedded import RRuntimeError
from rpy2.robjects.conversion import localconverter

# Use Arial for figure text when matplotlib has it registered; otherwise fall
# back to the generic sans-serif family.
installed_font_names = {font.name for font in fm.fontManager.ttflist}
if "Arial" in installed_font_names:
    plt.rcParams["font.family"] = "Arial"
else:
    plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.weight"] = "normal"

sys.path.insert(1, "/dss/dsshome1/0C/ra98gaq/Git/cellseg-benchmark")
from cellseg_benchmark._constants import cell_type_colors
from cellseg_benchmark.adata_utils import plot_pseudobulk_pca
from cellseg_benchmark.dea_utils import (
    add_ensembl_id,
    add_group_sample_counts,
    prepare_adata_for_rpy2,
    pseudobulk_aggregate_and_filter,
    safe_sheet,
)

# Silence the warning about duplicated observation names.
warnings.filterwarnings("ignore", message=".*Observation names are not unique*")
VALID_METHODS = {"LRT", "QL"}


def _parse_bool_flag(value):
    """Parse the text given to a boolean CLI option.

    Accepts true/1/yes and false/0/no (case-insensitive, surrounding
    whitespace ignored). Anything else raises ``argparse.ArgumentTypeError``
    so that a typo is reported instead of being read silently as False.
    """
    text = str(value).strip().lower()
    if text in {"true", "1", "yes"}:
        return True
    if text in {"false", "0", "no"}:
        return False
    raise argparse.ArgumentTypeError(
        f"expected one of true/1/yes or false/0/no, got {value!r}"
    )


def get_args(test_args=None):
    """Build the command-line parser for the edgeR pseudobulk DEA and parse it.

    Args:
        test_args: Optional list of argument strings to parse instead of the
            process command line (used to simulate a CLI call inside Jupyter).
            ``None`` makes argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``cohort``, ``seg_method``,
        ``sample_key``, ``subset_key``, ``subset_values``, ``condition_key``,
        ``batch_key``, ``ref``, ``test_groups``, ``edger_methods``,
        ``min_cells``, ``replicates_per_patient`` and ``overwrite``.

    Raises:
        SystemExit: Raised by argparse when the arguments are invalid, which
            includes an unrecognised value for ``--overwrite``.
    """
    p = argparse.ArgumentParser(description="Run edgeR pseudobulk-based DEA")
    p.add_argument("cohort", help="Cohort name, e.g. foxf2")
    p.add_argument(
        "seg_method", help="Segmentation method, e.g. Cellpose_1_nuclei_model"
    )
    p.add_argument(
        "--sample_key", default="sample", help="obs column for donor/sample ID"
    )
    p.add_argument(
        "--subset_key",
        default="cell_type",
        help="obs column used to subset data (e.g. cell_type, cluster, region)",
    )
    p.add_argument(
        "--subset_values",
        nargs="+",
        default=None,
        help="Values of subset_key to process (default: all unique values)",
    )
    p.add_argument(
        "--condition_key",
        default="genotype",
        help="obs column for condition (e.g. genotype)",
    )
    p.add_argument(
        "--batch_key",
        default="slide",
        help="optional batch key for inclusion as covariate (default: slide)",
    )
    p.add_argument("--ref", default="WT", help="Reference group (default: WT)")
    p.add_argument(
        "--test_groups",
        nargs="+",
        default=None,
        help="Groups to test vs reference (default: all groups in condition_key except --ref)",
    )
    p.add_argument(
        "--edger_methods",
        nargs="+",
        default=["LRT"],
        choices=sorted(VALID_METHODS),
        help="edgeR method(s) to run (default: LRT)",
    )
    p.add_argument("--min_cells", type=int, default=15, help="Minimum cells per donor")
    p.add_argument(
        "--replicates_per_patient",
        type=int,
        default=1,
        help="Number of pseudoreplicates per donor",
    )
    p.add_argument(
        "--overwrite",
        type=_parse_bool_flag,
        default=True,
        help="Overwrite existing result files (default: True)",
    )
    # parse_args(None) falls back to sys.argv[1:], so one call covers both the
    # real CLI and the simulated one.
    return p.parse_args(test_args)

In [2]:
# Simulate CLI arguments inside Jupyter (foxf2 cohort, genotype vs WT).
# When the notebook is run top to bottom, the following cell assigns `args`
# again, so this configuration only applies if that cell is skipped.
foxf2_cli = [
    "foxf2",
    "Cellpose_1_nuclei_model",
    "--sample_key",
    "sample",
    "--subset_key",
    "cell_type",
    "--condition_key",
    "genotype",
    "--ref",
    "WT",
    "--edger_methods",
    "LRT",
    "--overwrite",
    "False",
]
args = get_args(foxf2_cli)

In [3]:
# Simulate CLI arguments inside Jupyter (aging cohort, each age vs WT_3).
# This assignment replaces any `args` set by an earlier cell.
aging_cli = [
    "aging",
    "Cellpose_1_nuclei_model",
    "--sample_key",
    "sample",
    "--subset_key",
    "cell_type",
    "--condition_key",
    "condition",
    "--ref",
    "WT_3",
    "--edger_methods",
    "LRT",
    "--overwrite",
    "False",
]
args = get_args(aging_cli)

In [4]:
args

Namespace(cohort='aging', seg_method='Cellpose_1_nuclei_model', sample_key='sample', subset_key='cell_type', subset_values=None, condition_key='condition', batch_key='slide', ref='WT_3', test_groups=None, edger_methods=['LRT'], min_cells=15, replicates_per_patient=1, overwrite=False)

In [5]:
# Logger setup
logger = logging.getLogger("dea")
if not logger.handlers:
    # Configure only once so re-running this cell does not stack handlers.
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s"))
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    logger.propagate = False


def _raise_r_error(text):
    """Raise the stripped R console text as an RRuntimeError."""
    raise RRuntimeError(text.strip())


def _log_r_console(text):
    """Log R console text at WARNING if it starts with "warning", else at INFO."""
    is_warning = text.lstrip().lower().startswith("warning")
    emit = logger.warning if is_warning else logger.info
    return emit(f"R: {text.strip()}")


# Let rpy2's callback logger share the handlers of the "dea" logger.
rcb.logger.handlers = logger.handlers
# rcb.consolewrite_print = lambda x: logger.debug(f"R: {x.strip()}")
rcb.consolewrite_error = _raise_r_error
# rcb.consolewrite_message = lambda x: logger.info(f"R: {x.strip()}")
rcb.consolewrite_warn = _log_r_console
rcb.consolewrite_warnerror = rcb.consolewrite_warn

In [6]:
# R setup
# Conversion rules used for the Python <-> R hand-offs below: rpy2 defaults
# plus the pandas and anndata2ri converters.
# NOTE(review): `ro.pandas2ri` is reached as an attribute; presumably the
# submodule has already been imported elsewhere (e.g. by anndata2ri) - confirm.
conv = ro.default_converter + ro.pandas2ri.converter + anndata2ri.converter
# NOTE(review): relies on sys.path[1] still being the repository path that the
# import cell put there with sys.path.insert(1, ...).
r_script = Path(sys.path[1]) / "cellseg_benchmark" / "dea_utils.r"
# Source the R helper script, then fetch `edgeR_loop` from R's global environment.
ro.r["source"](str(r_script))
edgeR_loop = ro.globalenv["edgeR_loop"]

In [7]:
# Results for one cohort / segmentation method live under
# <base>/analysis/<cohort>/<seg_method>; DEA output goes into its "dea" folder.
base_path = Path("/dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark")
method_path = base_path.joinpath("analysis", args.cohort, args.seg_method)
output_dir = method_path.joinpath("dea")
output_dir.mkdir(parents=True, exist_ok=True)

In [8]:
# Read the integrated AnnData of the selected cohort / segmentation method.
logger.info("Loading integrated AnnData...")
adata = sc.read_h5ad(method_path / "adatas" / "adata_integrated.h5ad.gz")
# Work with the revised annotation: `cell_type` is overwritten by `cell_type_revised`.
adata.obs["cell_type"] = adata.obs["cell_type_revised"]

2025-10-31 16:11:27,865 [INFO]: Loading integrated AnnData...


In [9]:
# re-group cell types
# Fine-grained labels that are merged into a broader label before testing.
cell_type_merges = {
    "Tanycytes": "Ependymal",
    "Astroependymal": "Astrocytes",
    "Neurons-Glyc-Gaba": "Neurons-Other",
}
adata.obs["cell_type"] = (
    adata.obs["cell_type"]
    # Cast to object rather than str: astype(str) would turn missing labels
    # into the literal string "nan", which would then be treated as a group.
    .astype(object)
    .replace(cell_type_merges)
    .astype("category")
)

In [10]:
adata.obs["cell_type"].value_counts()

cell_type
Oligodendrocytes            131904
Neurons-Glut                124130
Neurons-Dopa                 77224
Astrocytes                   75424
ECs                          49431
Neurons-Granule-Immature     42337
Neurons-Other                40619
Undefined                    32012
Microglia                    29106
OPCs                         20832
Pericytes                    15056
VLMCs                         8346
Ependymal                     6539
SMCs                          5928
Choroid-Plexus                5657
Neurons-Gaba                  3499
BAMs                          1933
Immune-Other                   566
Name: count, dtype: int64

In [11]:
# Clean up group names for R conversion
def _clean_group_name(name):
    """Return ``name`` with characters that are awkward in R names removed.

    Spaces, "-" and "/" become "_"; "+", "(" and ")" are dropped. Missing
    values are returned unchanged.
    """
    if pd.isna(name):
        return name
    cleaned = name
    for old, new in ((" ", "_"), ("-", "_"), ("/", "_"), ("+", ""), ("(", ""), (")", "")):
        cleaned = cleaned.replace(old, new)
    return cleaned


adata.obs[args.subset_key] = [
    _clean_group_name(key) for key in adata.obs[args.subset_key]
]
for col in [args.condition_key, args.sample_key, args.subset_key]:
    adata.obs[col] = adata.obs[col].astype("category")

# Without explicit --test_groups, test every condition except the reference.
if args.test_groups is None:
    groups = adata.obs[args.condition_key].dropna().unique().tolist()
    args.test_groups = [g for g in groups if g != args.ref]
    msg = (
        f"Test groups inferred from '{args.condition_key}': {', '.join(map(str, args.test_groups))}"
        if args.test_groups
        else f"No test groups found different from ref '{args.ref}' under '{args.condition_key}'."
    )
    logger.info(msg)

# print sample overview
if args.subset_values is None:
    # dropna(): a missing label cannot be sorted together with strings.
    groups_to_process = sorted(adata.obs[args.subset_key].dropna().unique())
else:
    # Normalise requested values like the obs column, otherwise a value such
    # as "Neurons-Glut" would never match the cleaned label "Neurons_Glut".
    groups_to_process = [_clean_group_name(value) for value in args.subset_values]
for cond, obs_cond in adata.obs.groupby(args.condition_key, observed=True):
    samples = obs_cond[args.sample_key].unique()
    # map(str, ...) keeps the join working for non-string sample IDs.
    logger.info(f"{cond}: {len(samples)} samples → {', '.join(map(str, samples))}")

2025-10-31 16:12:12,017 [INFO]: Test groups inferred from 'condition': WT_6, WT_12, WT_18, WT_24
2025-10-31 16:12:12,111 [INFO]: WT_3: 3 samples → aging_s8_r1, aging_s10_r1, aging_s10_r2
2025-10-31 16:12:12,113 [INFO]: WT_6: 3 samples → aging_s1_r0, aging_s5_r1, aging_s7_r2
2025-10-31 16:12:12,115 [INFO]: WT_12: 3 samples → aging_s5_r2, aging_s8_r0, aging_s12_r0
2025-10-31 16:12:12,117 [INFO]: WT_18: 3 samples → aging_s6_r0, aging_s8_r2, aging_s11_r0
2025-10-31 16:12:12,119 [INFO]: WT_24: 3 samples → aging_s10_r0, aging_s11_r1, aging_s11_r2


In [12]:
# Per (subset, sample) statistics, broadcast back onto every cell of that
# group: number of cells, mean volume and summed volume.
cells_by_group_sample = adata.obs.groupby(
    [args.subset_key, args.sample_key], observed=True
)
n_cells_per_group = cells_by_group_sample[args.sample_key].transform("count")
volume_mean_per_group = cells_by_group_sample["volume_final"].transform("mean")
volume_sum_per_group = cells_by_group_sample["volume_final"].transform("sum")

adata.obs["n_cells_sum"] = n_cells_per_group
adata.obs["volume_mean"] = volume_mean_per_group
adata.obs["volume_sum"] = volume_sum_per_group

In [13]:
# Put a copy of the "counts" layer into X for the pseudobulk aggregation.
adata.X = adata.layers["counts"].copy()
# Guard: X must now have an integer dtype.
assert np.issubdtype(adata.X.dtype, np.integer)

In [14]:
# The batch covariate is only usable when a key is set and the column exists.
has_batch_column = bool(args.batch_key) and args.batch_key in adata.obs.columns
if not has_batch_column:
    logger.warning(
        f"Batch column '{args.batch_key}' was not found in adata.obs "
        "or is invalid. Continuing **without** a batch covariate."
    )
    args.batch_key = None

# obs columns carried over to the pseudobulk objects.
obs_to_keep = [
    args.condition_key,
    args.subset_key,
    args.sample_key,
    "n_cells_sum",
    "volume_mean",
    "volume_sum",
]
if has_batch_column:
    obs_to_keep.append(args.batch_key)

In [15]:
logger.info("Run pseudobulking...")
total = len(groups_to_process)
if total == 0:
    raise ValueError(f"No values of '{args.subset_key}' to process.")

# Keyword arguments shared by every pseudobulk call.
pb_kwargs = dict(
    sample_key=args.sample_key,
    subset_key=args.subset_key,
    obs_to_keep=obs_to_keep,
    min_cells=args.min_cells,
    replicates_per_patient=args.replicates_per_patient,
    logger=logger,
)

# One pseudobulk AnnData per group, collected first and concatenated once
# instead of growing `adata_pb` with a concat per iteration.
per_group_pb = []
for i, group in enumerate(groups_to_process, 1):
    logger.info(f"Processing {group} ({i}/{total})")
    per_group_pb.append(
        pseudobulk_aggregate_and_filter(adata, subset_value=group, **pb_kwargs)
    )
if total == 1:
    adata_pb = per_group_pb[0]
else:
    adata_pb = ad.concat(
        per_group_pb, join="outer", label=None, index_unique=None
    )
adata_pb.obs_names_make_unique()

# Add whole brain: pseudobulk per sample across all groups
adata_pb_all = pseudobulk_aggregate_and_filter(adata, subset_value=None, **pb_kwargs)
adata_pb_all.obs[args.subset_key] = "all"
adata_pb = ad.concat([adata_pb, adata_pb_all])
# Release the full single-cell object; re-run the loading cells before
# executing this cell again.
del adata

2025-10-31 16:12:12,660 [INFO]: Run pseudobulking...
2025-10-31 16:12:12,661 [INFO]: Processing Astrocytes (1/18)
2025-10-31 16:12:13,804 [INFO]: 	Processing donor 1/15...
2025-10-31 16:12:16,377 [INFO]: 	Processing donor 15/15...
2025-10-31 16:12:16,544 [INFO]: Processing BAMs (2/18)
2025-10-31 16:12:16,756 [INFO]: 	Processing donor 1/14...
2025-10-31 16:12:18,352 [INFO]: 	Processing donor 14/14...
2025-10-31 16:12:18,480 [INFO]: Processing Choroid_Plexus (3/18)
2025-10-31 16:12:18,697 [INFO]: 	Processing donor 1/15...
2025-10-31 16:12:20,443 [INFO]: 	Processing donor 15/15...
2025-10-31 16:12:20,571 [INFO]: Processing ECs (4/18)
2025-10-31 16:12:21,373 [INFO]: 	Processing donor 1/15...
2025-10-31 16:12:23,632 [INFO]: 	Processing donor 15/15...
2025-10-31 16:12:23,918 [INFO]: Processing Ependymal (5/18)
2025-10-31 16:12:24,137 [INFO]: 	Processing donor 1/15...
2025-10-31 16:12:25,881 [INFO]: 	Processing donor 15/15...
2025-10-31 16:12:26,003 [INFO]: Processing Immune_Other (6/18)
2025

In [16]:
plot_pseudobulk_pca(adata_pb, args, output_dir, cell_type_colors, logger)

2025-10-31 16:13:18,044 [INFO]: Skipping PCA plot, file already exists: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/pseudobulk_pca.png


In [17]:
logger.info("Run DEA...")
adata_pb = prepare_adata_for_rpy2(adata_pb, key=args.subset_key)
unique_groups = adata_pb.obs[args.subset_key].unique().tolist()
# One independent copy of the pseudobulk data per group, keyed by group label.
adatas_pb = {
    label: adata_pb[adata_pb.obs[args.subset_key] == label].copy()
    for label in unique_groups
}

2025-10-31 16:13:18,104 [INFO]: Run DEA...


In [18]:
adatas_pb

{'Astrocytes': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'BAMs': AnnData object with n_obs × n_vars = 14 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Choroid_Plexus': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'ECs': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Ependymal': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Immune_Other': AnnData object with n_obs × n_vars = 5 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Microglia': AnnData object with n_obs × n_vars 

In [19]:
adatas_pb["ECs"].X.todense()

matrix([[1756., 7463., 2459., ..., 3440., 4956., 1449.],
        [ 416., 3015., 1207., ..., 1316., 2401., 1032.],
        [ 674., 3962., 1005., ..., 1467., 1977.,  538.],
        ...,
        [ 361., 1928., 1236., ...,  829., 2706., 1066.],
        [ 462., 2598.,  881., ...,  981., 2747., 1311.],
        [  99., 1251.,  545., ...,  321., 1171.,  908.]],
       shape=(15, 500), dtype=float32)

In [None]:
# loop over cell types

In [21]:
# Run the R-side edgeR routine once per group; groups for which it returns
# NULL are left out of `all_degs`.
all_degs = {}
for group_name, group_adata in adatas_pb.items():
    with localconverter(conv):
        combined_results = edgeR_loop(
            adata=group_adata,
            group_i=group_name,
            edger_methods=args.edger_methods,
            test_groups=args.test_groups,
            ref_group=args.ref,
            condition_col=args.condition_key,
            batch_col=args.batch_key,
        )
    if combined_results is ro.NULL:
        continue
    all_degs[group_name] = combined_results

2025-10-29 15:36:04,571 [INFO]: R: Astrocytes: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 15:36:04,660 [INFO]: R: Design: ~ condition + slide
2025-10-29 15:36:10,224 [INFO]: R: BAMs: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 15:36:10,232 [INFO]: R: Design: ~ condition + slide
2025-10-29 15:36:15,957 [INFO]: R: Choroid_Plexus: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 15:36:15,965 [INFO]: R: Design: ~ condition + slide
2025-10-29 15:36:20,103 [INFO]: R: ECs: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 15:36:20,110 [INFO]: R: Design: ~ condition + slide
2025-10-29 15:36:24,053 [INFO]: R: Ependymal: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 15:36:24,061 [INFO]: R: Design: ~ condition + slide
2025-10-29 15:36:28,219 [INFO]: R: Immune_Other: skip (ref_group/test_groups missing)
2025-10-29 15:36:28,334 [INFO]: R: Microglia: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 15:36:28,340 [INFO]: R: Design: ~ condition + slide
2025-10-29 15:36:32,230 [INFO]: R: Neurons_Dopa: WT_6/WT_12/WT_18/WT_24 vs WT_3

In [80]:
# format and export

In [81]:
logger.info("Format output table...")
if not all_degs:
    # pd.concat would fail with "No objects to concatenate"; say why instead.
    raise ValueError(
        "No DEA results to format: `all_degs` is empty (every group was skipped)."
    )
# Stack the per-group result tables and tag each row with its group.
collapsed_df = pd.concat(
    [pd.DataFrame(v).assign(subset=k) for k, v in all_degs.items()], ignore_index=True
).drop(columns=["result_id"], errors="ignore")

collapsed_df = add_group_sample_counts(
    collapsed_df,
    adatas_pb,
    condition_key=args.condition_key,
    sample_key=args.sample_key,
    ref=args.ref,
    test_groups=args.test_groups,
    subset_group="subset",
)
# Gene symbols become the index and are also kept as a column in front of "FC".
collapsed_df = collapsed_df.set_index("gene")
collapsed_df.insert(collapsed_df.columns.get_loc("FC"), "gene", collapsed_df.index)

preferred_order = [
    "subset",
    "gene",
    "FC",
    "logFC",
    "PValue",
    "FDR",
    "logCPM",
    "LR",
    "method",
    "test_group",
    "ref",
    "test",
]
# Only reorder columns that exist: a statistic column such as "LR" is
# presumably absent when only the QL method was run, and indexing with a
# missing label raises KeyError.
leading = [c for c in preferred_order if c in collapsed_df.columns]
collapsed_df = collapsed_df[
    leading + [c for c in collapsed_df.columns if c not in leading]
]

logger.info("Add ensembl gene ids...")
collapsed_df = add_ensembl_id(collapsed_df, logger=logger)

2025-10-31 16:23:30,497 [INFO]: Format output table...
2025-10-31 16:23:30,545 [INFO]: Add ensembl gene ids...
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
3 input query terms found no hit:	['H2afj', 'Pifo', 'Ctps']
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
2025-10-31 16:23:34,067 [INFO]: 500 unique symbols -> 500 mapped (100.0%)
2025-10-31 16:23:34,069 [INFO]: Renamed aliases: H2afj->H2aj, Pifo->Cimap3, Ctps->Ctps1


In [82]:
collapsed_df

Unnamed: 0_level_0,subset,gene,FC,logFC,PValue,FDR,logCPM,LR,method,test_group,ref,test,test_group_n,ref_n,ensembl_id
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Rspo1,ECs,Rspo1,0.293731,-1.767434,0.000013,0.005180,7.807830,19.002640,edgeR_LRT,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000028871
Cdkn1a,ECs,Cdkn1a,0.262400,-1.930162,0.000036,0.005180,9.824937,17.067461,edgeR_LRT,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000023067
Gata3,ECs,Gata3,4.540910,2.182982,0.000037,0.005180,6.708165,17.000073,edgeR_LRT,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000015619
Spp1,ECs,Spp1,3.722140,1.896132,0.000041,0.005180,8.567996,16.804434,edgeR_LRT,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000029304
Mgp,ECs,Mgp,3.313243,1.728244,0.000147,0.014732,8.531981,14.406213,edgeR_LRT,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000030218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Folr1,ECs,Folr1,0.990512,-0.013754,0.962358,0.970119,5.414683,0.002227,edgeR_LRT,WT_24,WT_3,WT_24vsWT_3,3,3,ENSMUSG00000001827
Mag,ECs,Mag,1.005573,0.008018,0.974067,0.979947,8.655519,0.001057,edgeR_LRT,WT_24,WT_3,WT_24vsWT_3,3,3,ENSMUSG00000036634
Itgb5,ECs,Itgb5,1.002281,0.003287,0.989200,0.991908,9.199109,0.000183,edgeR_LRT,WT_24,WT_3,WT_24vsWT_3,3,3,ENSMUSG00000022817
Tjp3,ECs,Tjp3,1.001661,0.002394,0.991654,0.991908,7.567088,0.000109,edgeR_LRT,WT_24,WT_3,WT_24vsWT_3,3,3,ENSMUSG00000034917


In [83]:
logger.info("Export table(s)...")
# Remove "_" and "-" from the subset key for use in file names
# (e.g. "cell_type" -> "celltype").
subset_key_clean = re.sub(r"[_-]", "", args.subset_key)

2025-10-31 16:23:42,138 [INFO]: Export table(s)...


In [100]:
# Base name shared by all exported workbooks:
# "<cohort>-by-<cleaned subset key>_pseudobulk_edgeR".
name = f"{args.cohort}-by-{subset_key_clean}_pseudobulk_edgeR"
name

'aging-by-celltype_pseudobulk_edgeR'

In [101]:
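# Alternative base name with a "_libnorm" suffix, presumably for the run that
# uses library-size normalisation instead of the volume-based one.
# Uncomment to write those results to separate files.
# name = f"{args.cohort}-by-{subset_key_clean}_pseudobulk_edgeR_libnorm"
# name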

'aging-by-celltype_pseudobulk_edgeR_libnorm'

In [104]:
# One workbook per (method, test) pair, one sheet per subset, rows sorted by
# p-value.
for (method, test), method_test_df in collapsed_df.groupby(["method", "test"]):
    has_values = method_test_df.notna().any()
    method_test_df = method_test_df.loc[:, has_values]  # drop all-NaN cols
    method_suffix = method.split("_", 1)[-1]
    xlsx = output_dir / f"{name}_{method_suffix}_{test}.xlsx"
    if not args.overwrite and xlsx.exists():
        logger.info(f"  Exists, skip: {xlsx}")
        continue
    used = set()  # sheet names already taken in this workbook
    with pd.ExcelWriter(xlsx, engine="xlsxwriter") as writer:
        for subset_label, subset_df in method_test_df.groupby("subset", sort=True):
            sheet = safe_sheet(subset_label, used)
            ranked = subset_df.sort_values("PValue")
            ranked.to_excel(writer, sheet_name=sheet, index=False)
    logger.info(f"  Wrote: {xlsx}")

logger.info("Done.")

2025-10-31 16:31:53,965 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR_libnorm_LRT_WT_12vsWT_3.xlsx
2025-10-31 16:31:54,064 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR_libnorm_LRT_WT_18vsWT_3.xlsx
2025-10-31 16:31:54,162 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR_libnorm_LRT_WT_24vsWT_3.xlsx
2025-10-31 16:31:54,265 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR_libnorm_LRT_WT_6vsWT_3.xlsx
2025-10-31 16:31:54,266 [INFO]: Done.


In [20]:
# dissect loop manually

In [21]:
%load_ext rpy2.ipython

In [22]:
# Inputs for stepping through a single edgeR run by hand; the names match the
# variables passed into the %%R cells below via -i.
group_i = "ECs"
# NOTE: rebinds `adata` (the full object was deleted after pseudobulking) to
# the pseudobulk AnnData of the selected group.
adata = adatas_pb[group_i]
condition_col = args.condition_key
test_groups = args.test_groups
ref_group = args.ref
batch_col = args.batch_key
# Only one method is dissected here, independent of args.edger_methods.
edger_method = "LRT"

In [23]:
edger_method

'LRT'

In [24]:
batch_col

'slide'

In [25]:
condition_col

'condition'

In [26]:
test_groups

['WT_6', 'WT_12', 'WT_18', 'WT_24']

In [27]:
adata.obs.head()

Unnamed: 0,condition,cell_type,n_cells_sum,volume_mean,volume_sum,slide,sample
donor_aging_s1_r0_0-3,WT_6,ECs,3728,1552.182742,5786537.0,1,aging_s1_r0
donor_aging_s5_r1_0-3,WT_6,ECs,2798,1431.079209,4004160.0,5,aging_s5_r1
donor_aging_s5_r2_0-3,WT_12,ECs,3223,1503.27387,4845052.0,5,aging_s5_r2
donor_aging_s6_r0_0-3,WT_18,ECs,3349,1316.45675,4408814.0,6,aging_s6_r0
donor_aging_s7_r2_0-3,WT_6,ECs,3062,1276.856319,3909734.0,7,aging_s7_r2


In [28]:
#

In [29]:
%%R -c conv -i adata -i condition_col -i test_groups -i group_i -i ref_group -i batch_col -i edger_method -o design

# Build the design matrix for the selected group and export it as `design`.
# Condition factor: unused levels dropped, `ref_group` set as the first level.
condition <- droplevels(factor(colData(adata)[[condition_col]]))
condition <- stats::relevel(condition, ref = ref_group)

# use batch covariate if exists and has >1 level
use_batch <- !is.null(batch_col) && batch_col %in% colnames(colData(adata)) &&
             nlevels(droplevels(factor(colData(adata)[[batch_col]]))) > 1

# Additive model: condition, plus batch when it is usable.
design <- if (use_batch) {
  batch <- droplevels(factor(colData(adata)[[batch_col]]))
  model.matrix(~ condition + batch)
} else {
  model.matrix(~ condition)
}

message("  Design: ~ ", condition_col, if (use_batch) paste0(" + ", batch_col) else "")

# Residual degrees of freedom: rows of the design minus its rank.
rdof <- nrow(design) - qr(design)$rank
if (rdof <= 0) {
  message("  Skip: no residual df (likely too few samples or condition confounded with batch)")
  # NOTE(review): this cell is not a function body, so return(NULL) presumably
  # raises an error here instead of skipping quietly - confirm.
  return(NULL)
}

  Design: ~ condition + slide


In [30]:
# compare lib size normalization with usual volume-based norm

In [31]:
%%R -o libsize -o vol_sum
# Column sums of assay "X" and the `volume_sum` column of colData, both
# exported to Python for the comparison in the next cell.
libsize <- colSums(SummarizedExperiment::assay(adata, "X"))
vol_sum <- colData(adata)$volume_sum

In [32]:
np.corrcoef(libsize, vol_sum)[0, 1]

np.float64(0.7396053761788448)

In [33]:
%%R
# Build the DGEList from assay "X" with `condition` as the grouping, then keep
# only the genes selected by filterByExpr for this design.
min_count = 2
y <- edgeR::DGEList(assay(adata, "X"), group = condition)
keep <- edgeR::filterByExpr(y, min.count = min_count, design = design)
# keep.lib.sizes = FALSE: library sizes are recomputed from the retained genes.
y <- y[keep, , keep.lib.sizes = FALSE]

In [33]:
%%R
# replace library sizes with volume
# Use the summed volume of each pseudobulk sample as its library size, then
# compute normalisation factors on top of it and show the sample table.
vol <- colData(adata)[["volume_sum"]]
# anyNA() is checked first: with a missing volume, any(vol <= 0) would be NA
# and if() would stop with an unrelated "missing value" error.
if (is.null(vol) || anyNA(vol) || any(vol <= 0)) stop("Invalid volume in colData(adata), expects 'volume_sum'.")
y$samples$lib.size <- vol
y <- edgeR::calcNormFactors(y)
y$samples

                       group lib.size norm.factors
donor_aging_s1_r0_0-3   WT_6  5786537    1.6520609
donor_aging_s5_r1_0-3   WT_6  4004160    1.0459383
donor_aging_s5_r2_0-3  WT_12  4845052    0.7929747
donor_aging_s6_r0_0-3  WT_18  4408814    1.0758775
donor_aging_s7_r2_0-3   WT_6  3909734    0.9092402
donor_aging_s8_r0_0-3  WT_12  4560863    1.2528785
donor_aging_s8_r1_0-3   WT_3  5041294    1.3993642
donor_aging_s8_r2_0-3  WT_18  5789520    1.0354382
donor_aging_s10_r0_0-3 WT_24  4848763    0.8257560
donor_aging_s10_r1_0-3  WT_3  4507851    1.2900732
donor_aging_s10_r2_0-3  WT_3  5734632    1.3349858
donor_aging_s11_r0_0-3 WT_18  4025104    0.9739872
donor_aging_s11_r1_0-3 WT_24  4965043    0.7945015
donor_aging_s11_r2_0-3 WT_24  5106148    0.7781083
donor_aging_s12_r0_0-2 WT_12  3817003    0.4799239


In [34]:
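# OR (alternative to the volume-based cell above): normalise with TMM only.
# Re-run the DGEList/filterByExpr cell first, otherwise `y` still carries the
# volume-based library sizes set above.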

In [35]:
%%R
# Compute TMM normalisation factors on the current `y` and show the sample table.
# NOTE: `y` keeps whatever lib.size an earlier cell stored in it.
y <- edgeR::calcNormFactors(y, method="TMM")
y$samples

                       group lib.size norm.factors
donor_aging_s1_r0_0-3   WT_6  1743002    1.0336947
donor_aging_s5_r1_0-3   WT_6   783658    1.0010397
donor_aging_s5_r2_0-3  WT_12   725441    0.9932803
donor_aging_s6_r0_0-3  WT_18   842409    1.0799021
donor_aging_s7_r2_0-3   WT_6   706624    0.9302237
donor_aging_s8_r0_0-3  WT_12  1047030    1.0292272
donor_aging_s8_r1_0-3   WT_3  1413525    0.9576344
donor_aging_s8_r2_0-3  WT_18  1130092    0.9948433
donor_aging_s10_r0_0-3 WT_24   787150    0.9669853
donor_aging_s10_r1_0-3  WT_3  1179334    0.9603020
donor_aging_s10_r2_0-3  WT_3  1433232    1.0382107
donor_aging_s11_r0_0-3 WT_18   727508    1.0331344
donor_aging_s11_r1_0-3 WT_24   752044    1.0252157
donor_aging_s11_r2_0-3 WT_24   797327    0.9509489
donor_aging_s12_r0_0-2 WT_12   355197    1.0171961


In [36]:
%%R
# Fit the model for the chosen method:
#   "QL"  -> estimateDisp, then glmQLFit
#   "LRT" -> estimateGLMRobustDisp, then glmFit
# NOTE(review): there is no else branch, so any other value of `edger_method`
# leaves `fit` as NULL.
fit <- if (edger_method == "QL") {
  edgeR::glmQLFit(edgeR::estimateDisp(y, design), design)
} else if (edger_method == "LRT") {
  edgeR::glmFit(edgeR::estimateGLMRobustDisp(y, design), design)
}

In [37]:
%%R
# Check that the comparison is possible for this group and report which test
# groups are present (`present`) or absent (`missing`).
# NOTE(review): in this walk-through the checks run after the design cell,
# which has already called relevel() with `ref_group`.
# NOTE(review): return(NULL) outside a function presumably raises an error
# instead of skipping - confirm.
condition <- droplevels(factor(colData(adata)[[condition_col]]))
if (is.null(condition) || nlevels(condition) < 2) { message(group_i, ": skip (condition invalid or <2 levels)"); return(NULL) }
present <- intersect(test_groups, levels(condition))
if (!(ref_group %in% levels(condition)) || !length(present)) { message(group_i, ": skip (ref_group/test_groups missing)"); return(NULL) }
missing <- setdiff(test_groups, present)
message(group_i, ": ", paste(present, collapse="/"), " vs ", ref_group,
        if (length(missing)) paste0(" (missing:", paste(missing, collapse=","), ")") else "")

ECs: WT_6/WT_12/WT_18/WT_24 vs WT_3


In [38]:
%%R
present

[[1]]
[1] "WT_6"

[[2]]
[1] "WT_12"

[[3]]
[1] "WT_18"

[[4]]
[1] "WT_24"



In [59]:
%%R
# First version of the per-test-group loop: stores the method in a column named
# `edgeR_method` and keys results as "<test group>_<method>".
# NOTE: the following cell rebuilds `res` from scratch with a `method` column
# instead, so the result of this cell is discarded when both are run.
res <- list()
for (tg in present) {
  tmp <- edgeR_run_test(fit, design, edger_method, tg, ref_group)
  if (is.null(tmp)) next
  tmp <- dplyr::mutate(as.data.frame(tmp), FC = 2^logFC, edgeR_method = edger_method, test_group = tg, ref = ref_group)
  res[[paste(tg, edger_method, sep="_")]] <- tmp
}

In [74]:
%%R
# Run the test for every present test group against `ref_group` and collect the
# tables in `res`, keyed "<test group>_edgeR_<method>".
res <- list()
for (tg in present) {
  tmp <- edgeR_run_test(fit, design, edger_method, tg, ref_group)
  # Skip test groups for which the helper returned nothing.
  if (is.null(tmp)) next
  met <- paste0("edgeR_", edger_method)
  # Add the linear fold change (2^logFC) and the comparison metadata.
  tmp <- dplyr::mutate(as.data.frame(tmp), FC = 2^logFC, method = met, test_group = tg, ref = ref_group)
  res[[paste(tg, met, sep="_")]] <- tmp
}

In [75]:
%%R
head(res)

$WT_6_edgeR_LRT
                      logFC    logCPM           LR       PValue         FDR
Rspo1         -1.767434e+00  7.807830 1.900264e+01 1.305377e-05 0.005179547
Cdkn1a        -1.930162e+00  9.824937 1.706746e+01 3.607513e-05 0.005179547
Gata3          2.182982e+00  6.708165 1.700007e+01 3.737839e-05 0.005179547
Spp1           1.896132e+00  8.567996 1.680443e+01 4.143637e-05 0.005179547
Mgp            1.728244e+00  8.531981 1.440621e+01 1.473154e-04 0.014731543
Cx3cl1        -1.246154e+00 12.069438 1.369450e+01 2.150837e-04 0.017923644
Arpp21        -1.614785e+00 11.579817 1.310281e+01 2.948529e-04 0.021060920
Acta2          1.856266e+00  7.289953 1.274139e+01 3.576526e-04 0.022353288
Drd1          -1.793780e+00  8.071460 1.206626e+01 5.134265e-04 0.028523696
Adora2a       -1.921533e+00  8.412108 1.130381e+01 7.734838e-04 0.038674191
Lamp5         -1.983745e+00 10.533813 1.089927e+01 9.620227e-04 0.043728305
AI593442      -1.121516e+00 11.798480 8.082912e+00 4.468459e-03 0.172692

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



 9.636945e-01
Bak1          -0.0689881590  8.416449 5.937512e-02 8.074863e-01 9.636945e-01
Ackr1          0.0643700846  8.261611 5.723255e-02 8.109247e-01 9.636945e-01
Tnfrsf1a      -0.0764501566  9.925250 5.692568e-02 8.114227e-01 9.636945e-01
Bmx            0.0802979806  8.944002 5.692071e-02 8.114308e-01 9.636945e-01
Mapk9          0.0572085948 12.284671 5.536666e-02 8.139749e-01 9.644253e-01
Plpp3         -0.0550323004 12.951730 5.226662e-02 8.191651e-01 9.675227e-01
Mfge8          0.0655146820 11.082404 5.105143e-02 8.212437e-01 9.675227e-01
Rrm2          -0.0633453562  7.688269 4.796669e-02 8.266399e-01 9.675227e-01
Tjp3           0.0627719960  7.567088 4.761845e-02 8.272604e-01 9.675227e-01
Opalin         0.0569375218  8.266331 4.680764e-02 8.287144e-01 9.675227e-01
Lama4          0.0596160706 10.673636 4.649156e-02 8.292847e-01 9.675227e-01
Meox1          0.0557597607  7.884247 4.602271e-02 8.301345e-01 9.675227e-01
Podxl         -0.0474697168 13.900456 4.312141e-02 8.354968e-0

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



 edgeR_LRT      WT_12 WT_3
Mbp           WT_12vsWT_3 1.0335456 edgeR_LRT      WT_12 WT_3
Nefl          WT_12vsWT_3 1.0335886 edgeR_LRT      WT_12 WT_3
Gatm          WT_12vsWT_3 0.9695329 edgeR_LRT      WT_12 WT_3
Ptprc         WT_12vsWT_3 1.0307010 edgeR_LRT      WT_12 WT_3
Ccnd1         WT_12vsWT_3 0.9723832 edgeR_LRT      WT_12 WT_3
Cnn1          WT_12vsWT_3 1.0313708 edgeR_LRT      WT_12 WT_3
Ulk4          WT_12vsWT_3 0.9687396 edgeR_LRT      WT_12 WT_3
Itga11        WT_12vsWT_3 1.0331522 edgeR_LRT      WT_12 WT_3
Mlkl          WT_12vsWT_3 1.0295957 edgeR_LRT      WT_12 WT_3
Acan          WT_12vsWT_3 1.0323386 edgeR_LRT      WT_12 WT_3
Snta1         WT_12vsWT_3 1.0297570 edgeR_LRT      WT_12 WT_3
Ctnna2        WT_12vsWT_3 0.9717552 edgeR_LRT      WT_12 WT_3
Atp13a5       WT_12vsWT_3 0.9731376 edgeR_LRT      WT_12 WT_3
Ier3          WT_12vsWT_3 1.0305369 edgeR_LRT      WT_12 WT_3
Csf1r         WT_12vsWT_3 0.9649054 edgeR_LRT      WT_12 WT_3
Vip           WT_12vsWT_3 0.9634452 edgeR_L

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



 0.6500623 edgeR_LRT      WT_24 WT_3
Col4a1        WT_24vsWT_3 0.6891564 edgeR_LRT      WT_24 WT_3
Ncan          WT_24vsWT_3 0.5938012 edgeR_LRT      WT_24 WT_3
Sspo          WT_24vsWT_3 1.5917357 edgeR_LRT      WT_24 WT_3
Cldn5         WT_24vsWT_3 0.6674270 edgeR_LRT      WT_24 WT_3
Itga4         WT_24vsWT_3 0.6340054 edgeR_LRT      WT_24 WT_3
Foxf2         WT_24vsWT_3 0.6779752 edgeR_LRT      WT_24 WT_3
Mbp           WT_24vsWT_3 1.4424513 edgeR_LRT      WT_24 WT_3
Ctnnd2        WT_24vsWT_3 0.6034905 edgeR_LRT      WT_24 WT_3
Abcg2         WT_24vsWT_3 0.6868876 edgeR_LRT      WT_24 WT_3
Aqp1          WT_24vsWT_3 1.6315218 edgeR_LRT      WT_24 WT_3
Mfsd2a        WT_24vsWT_3 0.6294768 edgeR_LRT      WT_24 WT_3
Ier3          WT_24vsWT_3 1.6004186 edgeR_LRT      WT_24 WT_3
H2-Aa         WT_24vsWT_3 1.8994409 edgeR_LRT      WT_24 WT_3
Bak1          WT_24vsWT_3 0.6571646 edgeR_LRT      WT_24 WT_3
Depp1         WT_24vsWT_3 1.5255483 edgeR_LRT      WT_24 WT_3
Slc38a5       WT_24vsWT_3 0.60131

In [76]:
%%R -o out # noqa: F821
# Stack all result tables into one data frame and export it as `out`; the name
# of each list entry is stored in `result_id`.
# NOTE(review): return(NULL) outside a function presumably raises an error
# instead of skipping - confirm.
if (!length(res)) return(NULL)
out <- dplyr::bind_rows(res, .id="result_id")
# Recover the gene symbol from the row names by cutting everything from the
# first "..." onwards (presumably a suffix added to keep row names unique),
# then drop the row names.
out$gene <- sub("[.][.][.].*", "", rownames(out)); rownames(out) <- NULL
# Put `gene` and `FC` first, all other columns after them.
out <- dplyr::select(out, gene, FC, dplyr::everything())

In [77]:
out  # noqa: F821

Unnamed: 0,gene,FC,result_id,logFC,logCPM,LR,PValue,FDR,test,method,test_group,ref
1,Rspo1,0.293731,WT_6_edgeR_LRT,-1.767434,7.807830,19.002640,0.000013,0.005180,WT_6vsWT_3,edgeR_LRT,WT_6,WT_3
2,Cdkn1a,0.262400,WT_6_edgeR_LRT,-1.930162,9.824937,17.067461,0.000036,0.005180,WT_6vsWT_3,edgeR_LRT,WT_6,WT_3
3,Gata3,4.540910,WT_6_edgeR_LRT,2.182982,6.708165,17.000073,0.000037,0.005180,WT_6vsWT_3,edgeR_LRT,WT_6,WT_3
4,Spp1,3.722140,WT_6_edgeR_LRT,1.896132,8.567996,16.804434,0.000041,0.005180,WT_6vsWT_3,edgeR_LRT,WT_6,WT_3
5,Mgp,3.313243,WT_6_edgeR_LRT,1.728244,8.531981,14.406213,0.000147,0.014732,WT_6vsWT_3,edgeR_LRT,WT_6,WT_3
...,...,...,...,...,...,...,...,...,...,...,...,...
1996,Folr1,0.990512,WT_24_edgeR_LRT,-0.013754,5.414683,0.002227,0.962358,0.970119,WT_24vsWT_3,edgeR_LRT,WT_24,WT_3
1997,Mag,1.005573,WT_24_edgeR_LRT,0.008018,8.655519,0.001057,0.974067,0.979947,WT_24vsWT_3,edgeR_LRT,WT_24,WT_3
1998,Itgb5,1.002281,WT_24_edgeR_LRT,0.003287,9.199109,0.000183,0.989200,0.991908,WT_24vsWT_3,edgeR_LRT,WT_24,WT_3
1999,Tjp3,1.001661,WT_24_edgeR_LRT,0.002394,7.567088,0.000109,0.991654,0.991908,WT_24vsWT_3,edgeR_LRT,WT_24,WT_3


In [78]:
group_i

'ECs'

In [79]:
# NOTE: this resets `all_degs`; afterwards it holds only the manually dissected
# group, replacing whatever an earlier loop over all groups collected.
all_degs = {}
if out is not ro.NULL:
    all_degs[group_i] = out

In [56]:
# end