In [1]:
# ruff: noqa
import argparse
import logging
import re
import sys
import warnings
from pathlib import Path

import anndata as ad
import anndata2ri
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro
import scanpy as sc
from rpy2.rinterface_lib.embedded import RRuntimeError
from rpy2.robjects.conversion import localconverter

# Prefer Arial when matplotlib can find it; otherwise fall back to the generic sans-serif family.
available_fonts = {font.name for font in fm.fontManager.ttflist}
plt.rcParams["font.family"] = "Arial" if "Arial" in available_fonts else "sans-serif"
plt.rcParams["font.weight"] = "normal"

sys.path.insert(1, "/dss/dsshome1/0C/ra98gaq/Git/cellseg-benchmark")
from cellseg_benchmark._constants import cell_type_colors
from cellseg_benchmark.adata_utils import plot_pseudobulk_pca
from cellseg_benchmark.dea_utils import (
    add_ensembl_id,
    add_group_sample_counts,
    prepare_adata_for_rpy2,
    pseudobulk_aggregate_and_filter,
    safe_sheet,
)

warnings.filterwarnings("ignore", message=".*Observation names are not unique*")
VALID_METHODS = {"LRT", "QL"}

# Accepted spellings for boolean CLI values (compared case-insensitively).
_TRUE_STRINGS = {"true", "1", "yes", "y", "t", "on"}
_FALSE_STRINGS = {"false", "0", "no", "n", "f", "off"}


def _str2bool(value):
    """Convert a CLI string to a bool, rejecting anything unrecognised.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is neither a known true nor a
            known false spelling (argparse turns this into a usage error).
    """
    text = str(value).strip().lower()
    if text in _TRUE_STRINGS:
        return True
    if text in _FALSE_STRINGS:
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean (true/false, yes/no, 1/0), got {value!r}"
    )


def get_args(test_args=None):
    """Parse the command-line options of the edgeR pseudobulk DEA run.

    Args:
        test_args: Optional list of argument strings to parse instead of
            ``sys.argv[1:]`` (used to simulate a CLI call inside Jupyter).

    Returns:
        argparse.Namespace with ``cohort``, ``seg_method``, ``sample_key``,
        ``subset_key``, ``subset_values``, ``condition_key``, ``batch_key``,
        ``ref``, ``test_groups``, ``edger_methods``, ``min_cells``,
        ``replicates_per_patient`` and ``overwrite``.
    """
    p = argparse.ArgumentParser(description="Run edgeR pseudobulk-based DEA")
    p.add_argument("cohort", help="Cohort name, e.g. foxf2")
    p.add_argument(
        "seg_method", help="Segmentation method, e.g. Cellpose_1_nuclei_model"
    )
    p.add_argument(
        "--sample_key", default="sample", help="obs column for donor/sample ID"
    )
    p.add_argument(
        "--subset_key",
        default="cell_type",
        help="obs column used to subset data (e.g. cell_type, cluster, region)",
    )
    p.add_argument(
        "--subset_values",
        nargs="+",
        default=None,
        help="Values of subset_key to process (default: all unique values)",
    )
    p.add_argument(
        "--condition_key",
        default="genotype",
        help="obs column for condition (e.g. genotype)",
    )
    p.add_argument(
        "--batch_key",
        default="slide",
        help="optional batch key for inclusion as covariate (default: slide)",
    )
    p.add_argument("--ref", default="WT", help="Reference group (default: WT)")
    p.add_argument(
        "--test_groups",
        nargs="+",
        default=None,
        help="Groups to test vs reference (default: all groups in condition_key except --ref)",
    )
    p.add_argument(
        "--edger_methods",
        nargs="+",
        default=["LRT"],
        choices=sorted(VALID_METHODS),
        help="edgeR method(s) to run (default: LRT)",
    )
    p.add_argument("--min_cells", type=int, default=15, help="Minimum cells per donor")
    p.add_argument(
        "--replicates_per_patient",
        type=int,
        default=1,
        help="Number of pseudoreplicates per donor",
    )
    p.add_argument(
        "--overwrite",
        # Strict parser: a typo such as "Ture" is an error instead of a silent False.
        type=_str2bool,
        default=True,
        help="Overwrite existing result files (default: True)",
    )
    # parse_args(None) falls back to sys.argv[1:], so a single call covers both cases.
    return p.parse_args(test_args)

In [2]:
# Simulate CLI arguments inside Jupyter (foxf2 cohort).
# NOTE: the following cell rebinds `args` with the aging configuration, so this
# call only takes effect when that cell is not run.
foxf2_cli = (
    "foxf2 Cellpose_1_nuclei_model "
    "--sample_key sample "
    "--subset_key cell_type "
    "--condition_key genotype "
    "--ref WT "
    "--edger_methods LRT "
    "--overwrite False"
)
args = get_args(foxf2_cli.split())

In [3]:
# Simulate CLI arguments inside Jupyter (aging cohort).
# This rebinds `args`, replacing the foxf2 configuration from the previous cell.
aging_cli = (
    "aging Cellpose_1_nuclei_model "
    "--sample_key sample "
    "--subset_key cell_type "
    "--condition_key condition "
    "--ref WT_3 "
    "--edger_methods LRT "
    "--overwrite False"
)
args = get_args(aging_cli.split())

In [4]:
args

Namespace(cohort='aging', seg_method='Cellpose_1_nuclei_model', sample_key='sample', subset_key='cell_type', subset_values=None, condition_key='condition', batch_key='slide', ref='WT_3', test_groups=None, edger_methods=['LRT'], min_cells=15, replicates_per_patient=1, overwrite=False)

In [5]:
# Logger setup: a single stream handler on the "dea" logger. The guard keeps
# re-running this cell from stacking duplicate handlers.
logger = logging.getLogger("dea")
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s"))
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    logger.propagate = False

# rpy2's callback logger shares the very same handler list as the "dea" logger.
rcb.logger.handlers = logger.handlers


def _raise_r_error(x):
    """Raise the text handed to the R error callback as an RRuntimeError."""
    raise RRuntimeError(x.strip())


def _log_r_console(x):
    """Log R console text: WARNING when it starts with "warning", INFO otherwise."""
    emit = logger.warning if x.lstrip().lower().startswith("warning") else logger.info
    emit(f"R: {x.strip()}")


# rcb.consolewrite_print = lambda x: logger.debug(f"R: {x.strip()}")
rcb.consolewrite_error = _raise_r_error
# rcb.consolewrite_message = lambda x: logger.info(f"R: {x.strip()}")
rcb.consolewrite_warn = _log_r_console
rcb.consolewrite_warnerror = rcb.consolewrite_warn

In [6]:
# R setup
# Converter stack (rpy2 defaults + pandas + AnnData) used for the Python<->R
# hand-offs further down (localconverter(conv) and the %%R -c conv cell).
conv = ro.default_converter + ro.pandas2ri.converter + anndata2ri.converter
# NOTE(review): relies on the project checkout sitting at sys.path[1], i.e. on the
# sys.path.insert(1, ...) call in the first cell; any other insert position or a
# later sys.path change breaks this lookup.
r_script = Path(sys.path[1]) / "cellseg_benchmark" / "dea_utils.r"
ro.r["source"](str(r_script))
# edgeR_loop is looked up in R's global environment; presumably defined by the
# script sourced above.
edgeR_loop = ro.globalenv["edgeR_loop"]

In [7]:
# Project data root (absolute, site-specific path).
base_path = Path("/dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark")
# Analysis folder for the selected cohort and segmentation method; the integrated
# AnnData is read from below this folder as well.
method_path = base_path / "analysis" / args.cohort / args.seg_method
# DEA results are written to <method_path>/dea; the folder is created when missing
# and an existing folder is not an error.
output_dir = method_path / "dea"
output_dir.mkdir(parents=True, exist_ok=True)

In [8]:
logger.info("Loading integrated AnnData...")
adata = sc.read_h5ad(method_path / "adatas" / "adata_integrated.h5ad.gz")
# NOTE(review): the revised annotation is copied into the literal "cell_type" column,
# which coincides with the default --subset_key; a run with another --subset_key is
# not affected by this line.
adata.obs["cell_type"] = adata.obs["cell_type_revised"]

2025-10-31 13:54:07,054 [INFO]: Loading integrated AnnData...


In [9]:
# Re-group cell types: fold selected labels into another class; every other
# label is kept as it is.
cell_type_regrouping = {
    "Tanycytes": "Ependymal",
    "Astroependymal": "Astrocytes",
    "Neurons-Glyc-Gaba": "Neurons-Other",
}
cell_type_labels = adata.obs["cell_type"].astype(str)
adata.obs["cell_type"] = cell_type_labels.map(
    lambda label: cell_type_regrouping.get(label, label)
).astype("category")

In [10]:
adata.obs["cell_type"].value_counts()

cell_type
Oligodendrocytes            131904
Neurons-Glut                124130
Neurons-Dopa                 77224
Astrocytes                   75424
ECs                          49431
Neurons-Granule-Immature     42337
Neurons-Other                40619
Undefined                    32012
Microglia                    29106
OPCs                         20832
Pericytes                    15056
VLMCs                         8346
Ependymal                     6539
SMCs                          5928
Choroid-Plexus                5657
Neurons-Gaba                  3499
BAMs                          1933
Immune-Other                   566
Name: count, dtype: int64

In [11]:
# Clean up group names for R conversion
# " ", "-" and "/" become "_"; "+", "(" and ")" are dropped.
_GROUP_NAME_TABLE = str.maketrans(
    {" ": "_", "-": "_", "/": "_", "+": None, "(": None, ")": None}
)


def _clean_group_name(name):
    """Return ``name`` with the characters above replaced/removed; missing values pass through."""
    return name if pd.isna(name) else name.translate(_GROUP_NAME_TABLE)


adata.obs[args.subset_key] = [
    _clean_group_name(key) for key in adata.obs[args.subset_key]
]
for col in [args.condition_key, args.sample_key, args.subset_key]:
    adata.obs[col] = adata.obs[col].astype("category")
if args.test_groups is None:
    groups = adata.obs[args.condition_key].dropna().unique().tolist()
    args.test_groups = [g for g in groups if g != args.ref]
    msg = (
        f"Test groups inferred from '{args.condition_key}': {', '.join(map(str, args.test_groups))}"
        if args.test_groups
        else f"No test groups found different from ref '{args.ref}' under '{args.condition_key}'."
    )
    logger.info(msg)

# print sample overview
if args.subset_values is None:
    # dropna(): a missing label cannot be sorted against strings and is not a group.
    groups_to_process = sorted(adata.obs[args.subset_key].dropna().unique())
else:
    # Apply the same cleaning as for the obs column, otherwise a value such as
    # "Neurons-Glut" would never match the cleaned label "Neurons_Glut".
    groups_to_process = [_clean_group_name(value) for value in args.subset_values]
for cond, df in adata.obs.groupby(args.condition_key, observed=True):
    samples = df[args.sample_key].unique()
    # map(str, ...) keeps the join working for non-string sample IDs.
    logger.info(f"{cond}: {len(samples)} samples → {', '.join(map(str, samples))}")

2025-10-31 13:54:52,246 [INFO]: Test groups inferred from 'condition': WT_6, WT_12, WT_18, WT_24
2025-10-31 13:54:52,345 [INFO]: WT_3: 3 samples → aging_s8_r1, aging_s10_r1, aging_s10_r2
2025-10-31 13:54:52,348 [INFO]: WT_6: 3 samples → aging_s1_r0, aging_s5_r1, aging_s7_r2
2025-10-31 13:54:52,350 [INFO]: WT_12: 3 samples → aging_s5_r2, aging_s8_r0, aging_s12_r0
2025-10-31 13:54:52,352 [INFO]: WT_18: 3 samples → aging_s6_r0, aging_s8_r2, aging_s11_r0
2025-10-31 13:54:52,353 [INFO]: WT_24: 3 samples → aging_s10_r0, aging_s11_r1, aging_s11_r2


In [12]:
# Per (subset, sample) statistics, broadcast back onto every cell of that group:
# cell count, mean cell volume and summed cell volume.
by_subset_and_sample = adata.obs.groupby(
    [args.subset_key, args.sample_key], observed=True
)
group_stats = {
    "n_cells_sum": by_subset_and_sample[args.sample_key].transform("count"),
    "volume_mean": by_subset_and_sample["volume_final"].transform("mean"),
    "volume_sum": by_subset_and_sample["volume_final"].transform("sum"),
}
for stat_name, stat_values in group_stats.items():
    adata.obs[stat_name] = stat_values

In [13]:
# Switch X to the data stored in the "counts" layer (copied, so X and the layer
# stay independent objects).
adata.X = adata.layers["counts"].copy()
# Fail fast when the counts are not stored with an integer dtype.
assert np.issubdtype(adata.X.dtype, np.integer)

In [14]:
# obs columns carried over into the pseudobulk objects.
has_batch_column = bool(args.batch_key) and args.batch_key in adata.obs.columns
obs_to_keep = [
    args.condition_key,
    args.subset_key,
    args.sample_key,
    "n_cells_sum",
    "volume_mean",
    "volume_sum",
]
if not has_batch_column:
    # No usable batch column: warn and disable the covariate for the rest of the run.
    logger.warning(
        f"Batch column '{args.batch_key}' was not found in adata.obs "
        "or is invalid. Continuing **without** a batch covariate."
    )
    args.batch_key = None
else:
    obs_to_keep.append(args.batch_key)

In [15]:
logger.info("Run pseudobulking...")
if not groups_to_process:
    # Previously surfaced as a bare IndexError on groups_to_process[0].
    raise ValueError(
        f"No groups to process for subset key '{args.subset_key}'; "
        "check --subset_values and the input data."
    )

# Keyword arguments shared by every pseudobulk call (per group and whole brain).
pb_kwargs = dict(
    sample_key=args.sample_key,
    subset_key=args.subset_key,
    obs_to_keep=obs_to_keep,
    min_cells=args.min_cells,
    replicates_per_patient=args.replicates_per_patient,
    logger=logger,
)

total = len(groups_to_process)
adatas_per_group = []
for i, group in enumerate(groups_to_process, 1):
    logger.info(f"Processing {group} ({i}/{total})")
    adatas_per_group.append(
        pseudobulk_aggregate_and_filter(adata, subset_value=group, **pb_kwargs)
    )

# Concatenate all groups in one call instead of growing the object pairwise inside
# the loop; the outer join keeps the union of variables across groups.
adata_pb = ad.concat(adatas_per_group, join="outer", label=None, index_unique=None)
adata_pb.obs_names_make_unique()

# Add whole brain: pseudobulk per sample across all groups
adata_pb_i = pseudobulk_aggregate_and_filter(adata, subset_value=None, **pb_kwargs)
adata_pb_i.obs[args.subset_key] = "all"
# NOTE: this final concat uses ad.concat's default join (inner), unlike the
# per-group concat above, and runs after obs_names_make_unique().
adata_pb = ad.concat([adata_pb, adata_pb_i])
# Drop the cell-level object; this cell cannot be re-run without reloading `adata`.
del adata

2025-10-31 13:54:52,915 [INFO]: Run pseudobulking...
2025-10-31 13:54:52,917 [INFO]: Processing Astrocytes (1/18)
2025-10-31 13:54:54,097 [INFO]: 	Processing donor 1/15...
2025-10-31 13:54:56,582 [INFO]: 	Processing donor 15/15...
2025-10-31 13:54:56,741 [INFO]: Processing BAMs (2/18)
2025-10-31 13:54:56,951 [INFO]: 	Processing donor 1/14...
2025-10-31 13:54:58,544 [INFO]: 	Processing donor 14/14...
2025-10-31 13:54:58,666 [INFO]: Processing Choroid_Plexus (3/18)
2025-10-31 13:54:58,861 [INFO]: 	Processing donor 1/15...
2025-10-31 13:55:00,565 [INFO]: 	Processing donor 15/15...
2025-10-31 13:55:00,687 [INFO]: Processing ECs (4/18)
2025-10-31 13:55:01,517 [INFO]: 	Processing donor 1/15...
2025-10-31 13:55:03,768 [INFO]: 	Processing donor 15/15...
2025-10-31 13:55:04,052 [INFO]: Processing Ependymal (5/18)
2025-10-31 13:55:04,260 [INFO]: 	Processing donor 1/15...
2025-10-31 13:55:05,988 [INFO]: 	Processing donor 15/15...
2025-10-31 13:55:06,108 [INFO]: Processing Immune_Other (6/18)
2025

In [16]:
plot_pseudobulk_pca(adata_pb, args, output_dir, cell_type_colors, logger)

2025-10-31 13:56:08,969 [INFO]: Skipping PCA plot, file already exists: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/pseudobulk_pca.png


In [17]:
logger.info("Run DEA...")
adata_pb = prepare_adata_for_rpy2(adata_pb, key=args.subset_key)
# One independent AnnData copy per subset value, keyed by that value.
unique_groups = adata_pb.obs[args.subset_key].unique().tolist()
adatas_pb = {
    key: adata_pb[adata_pb.obs[args.subset_key] == key].copy()
    for key in unique_groups
}

2025-10-31 13:56:09,112 [INFO]: Run DEA...


In [18]:
adatas_pb

{'Astrocytes': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'BAMs': AnnData object with n_obs × n_vars = 14 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Choroid_Plexus': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'ECs': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Ependymal': AnnData object with n_obs × n_vars = 15 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Immune_Other': AnnData object with n_obs × n_vars = 5 × 500
     obs: 'condition', 'cell_type', 'n_cells_sum', 'volume_mean', 'volume_sum', 'slide', 'sample',
 'Microglia': AnnData object with n_obs × n_vars 

In [19]:
adatas_pb["ECs"].X.todense()

matrix([[1756., 7463., 2459., ..., 3440., 4956., 1449.],
        [ 416., 3015., 1207., ..., 1316., 2401., 1032.],
        [ 674., 3962., 1005., ..., 1467., 1977.,  538.],
        ...,
        [ 361., 1928., 1236., ...,  829., 2706., 1066.],
        [ 462., 2598.,  881., ...,  981., 2747., 1311.],
        [  99., 1251.,  545., ...,  321., 1171.,  908.]],
       shape=(15, 500), dtype=float32)

In [None]:
# loop over cell types

In [21]:
# Run the R-side edgeR loop once per subset; subsets for which R returns NULL
# are left out of all_degs.
all_degs = {}
for group_i, adata_tmp in adatas_pb.items():
    with localconverter(conv):
        combined_results = edgeR_loop(
            adata=adata_tmp,
            group_i=group_i,
            edger_methods=args.edger_methods,
            test_groups=args.test_groups,
            ref_group=args.ref,
            condition_col=args.condition_key,
            batch_col=args.batch_key,
        )
    if combined_results is ro.NULL:
        continue
    all_degs[group_i] = combined_results

2025-10-29 15:36:04,571 [INFO]: R: Astrocytes: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 15:36:04,660 [INFO]: R: Design: ~ condition + slide
2025-10-29 15:36:10,224 [INFO]: R: BAMs: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 15:36:10,232 [INFO]: R: Design: ~ condition + slide
2025-10-29 15:36:15,957 [INFO]: R: Choroid_Plexus: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 15:36:15,965 [INFO]: R: Design: ~ condition + slide
2025-10-29 15:36:20,103 [INFO]: R: ECs: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 15:36:20,110 [INFO]: R: Design: ~ condition + slide
2025-10-29 15:36:24,053 [INFO]: R: Ependymal: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 15:36:24,061 [INFO]: R: Design: ~ condition + slide
2025-10-29 15:36:28,219 [INFO]: R: Immune_Other: skip (ref_group/test_groups missing)
2025-10-29 15:36:28,334 [INFO]: R: Microglia: WT_6/WT_12/WT_18/WT_24 vs WT_3
2025-10-29 15:36:28,340 [INFO]: R: Design: ~ condition + slide
2025-10-29 15:36:32,230 [INFO]: R: Neurons_Dopa: WT_6/WT_12/WT_18/WT_24 vs WT_3

In [None]:
# format and export

In [45]:
logger.info("Format output table...")
if not all_degs:
    # pd.concat([]) would otherwise fail with an opaque "No objects to concatenate".
    raise ValueError(
        "No DEA results to format: all_degs is empty (every subset was skipped)."
    )

# Stack the per-subset result tables, tagging each row with its subset.
collapsed_df = pd.concat(
    [pd.DataFrame(v).assign(subset=k) for k, v in all_degs.items()], ignore_index=True
).drop(columns=["result_id"], errors="ignore")

collapsed_df = add_group_sample_counts(
    collapsed_df,
    adatas_pb,
    condition_key=args.condition_key,
    sample_key=args.sample_key,
    ref=args.ref,
    test_groups=args.test_groups,
    subset_group="subset",
)
# Keep the gene symbol both as index and as a regular column placed before "FC".
collapsed_df = collapsed_df.set_index("gene")
collapsed_df.insert(collapsed_df.columns.get_loc("FC"), "gene", collapsed_df.index)

preferred_order = [
    "subset",
    "gene",
    "FC",
    "logFC",
    "PValue",
    "FDR",
    "logCPM",
    "LR",
    "edgeR_method",
    "test_group",
    "ref",
    "test",
]
# Only order columns that exist: "LR" is absent when only the QL method was run
# (its tables carry "F" instead), and indexing with a missing label raises KeyError.
order = [c for c in preferred_order if c in collapsed_df.columns]
collapsed_df = collapsed_df[order + [c for c in collapsed_df.columns if c not in order]]

logger.info("Add ensembl gene ids...")
collapsed_df = add_ensembl_id(collapsed_df, logger=logger)

2025-10-31 13:44:03,599 [INFO]: Format output table...
2025-10-31 13:44:03,650 [INFO]: Add ensembl gene ids...
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
3 input query terms found no hit:	['H2afj', 'Pifo', 'Ctps']
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
2025-10-31 13:44:07,343 [INFO]: 500 unique symbols -> 500 mapped (100.0%)
2025-10-31 13:44:07,345 [INFO]: Renamed aliases: H2afj->H2aj, Pifo->Cimap3, Ctps->Ctps1


In [46]:
collapsed_df

Unnamed: 0_level_0,subset,gene,FC,logFC,PValue,FDR,logCPM,LR,edgeR_method,test_group,ref,test,test_group_n,ref_n,ensembl_id
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Rspo1,ECs,Rspo1,0.287876,-1.796480,0.000010,0.004887,5.416194,19.555233,LRT,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000028871
Cdkn1a,ECs,Cdkn1a,0.257176,-1.959172,0.000024,0.006035,7.433052,17.831155,LRT,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000023067
Gata3,ECs,Gata3,4.451607,2.154326,0.000049,0.007130,4.316713,16.477494,LRT,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000015619
Spp1,ECs,Spp1,3.649128,1.867552,0.000057,0.007130,6.179765,16.198348,LRT,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000029304
Cx3cl1,ECs,Cx3cl1,0.413188,-1.275128,0.000125,0.012484,9.679503,14.718062,LRT,WT_6,WT_3,WT_6vsWT_3,3,3,ENSMUSG00000031778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pdgfc,ECs,Pdgfc,0.998429,-0.002268,0.991263,0.997057,5.428233,0.000120,LRT,WT_24,WT_3,WT_24vsWT_3,3,3,ENSMUSG00000028019
Cfp,ECs,Cfp,0.998099,-0.002745,0.991505,0.997057,4.526584,0.000113,LRT,WT_24,WT_3,WT_24vsWT_3,3,3,ENSMUSG00000001128
Enpp6,ECs,Enpp6,1.001537,0.002216,0.993435,0.997057,4.665809,0.000068,LRT,WT_24,WT_3,WT_24vsWT_3,3,3,ENSMUSG00000038173
Pifo,ECs,Cimap3,1.001079,0.001556,0.995063,0.997057,3.364486,0.000038,LRT,WT_24,WT_3,WT_24vsWT_3,3,3,ENSMUSG00000010136


In [47]:
logger.info("Export table(s)...")
subset_key_clean = re.sub(r"[_-]", "", args.subset_key)

2025-10-31 13:44:35,268 [INFO]: Export table(s)...


In [49]:
name = f"{args.cohort}-by-{subset_key_clean}_pseudobulk_edgeR"
name

'aging-by-celltype_pseudobulk_edgeR'

In [50]:
#name = f"{args.cohort}-by-{subset_key_clean}_pseudobulk_edgeR_libnorm"

In [51]:
# One workbook per (edgeR method, contrast); inside it one sheet per subset,
# rows sorted by p-value.
for (edgeR_method, test), df_mt in collapsed_df.groupby(["edgeR_method", "test"]):
    has_values = df_mt.notna().any()
    df_mt = df_mt.loc[:, has_values]  # drop all-NaN cols
    xlsx = output_dir / f"{name}_{edgeR_method}_{test}.xlsx"
    if not args.overwrite and xlsx.exists():
        logger.info(f"  Exists, skip: {xlsx}")
        continue
    used = set()  # handed to safe_sheet for every sheet of this workbook
    with pd.ExcelWriter(xlsx, engine="xlsxwriter") as writer:
        for gid, g in df_mt.groupby("subset", sort=True):
            ranked = g.sort_values("PValue")
            sheet_name = safe_sheet(gid, used)
            ranked.to_excel(writer, sheet_name=sheet_name, index=False)
    logger.info(f"  Wrote: {xlsx}")

logger.info("Done.")

2025-10-31 13:44:54,347 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR_libnorm_LRT_WT_12vsWT_3.xlsx
2025-10-31 13:44:54,436 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR_libnorm_LRT_WT_18vsWT_3.xlsx
2025-10-31 13:44:54,526 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR_libnorm_LRT_WT_24vsWT_3.xlsx
2025-10-31 13:44:54,617 [INFO]:   Wrote: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/analysis/aging/Cellpose_1_nuclei_model/dea/aging-by-celltype_pseudobulk_edgeR_libnorm_LRT_WT_6vsWT_3.xlsx
2025-10-31 13:44:54,619 [INFO]: Done.


In [22]:
# Dissect the edgeR loop manually: step through the analysis for a single group (debugging)

In [20]:
%load_ext rpy2.ipython

In [21]:
group_i = "ECs"
adata = adatas_pb[group_i]
condition_col = args.condition_key
test_groups = args.test_groups
ref_group = args.ref
batch_col = args.batch_key
edger_methods = args.edger_methods

In [22]:
edger_methods

['LRT']

In [23]:
batch_col

'slide'

In [24]:
condition_col

'condition'

In [25]:
test_groups

['WT_6', 'WT_12', 'WT_18', 'WT_24']

In [26]:
adata.obs.head()

Unnamed: 0,condition,cell_type,n_cells_sum,volume_mean,volume_sum,slide,sample
donor_aging_s1_r0_0-3,WT_6,ECs,3728,1552.182742,5786537.0,1,aging_s1_r0
donor_aging_s5_r1_0-3,WT_6,ECs,2798,1431.079209,4004160.0,5,aging_s5_r1
donor_aging_s5_r2_0-3,WT_12,ECs,3223,1503.27387,4845052.0,5,aging_s5_r2
donor_aging_s6_r0_0-3,WT_18,ECs,3349,1316.45675,4408814.0,6,aging_s6_r0
donor_aging_s7_r2_0-3,WT_6,ECs,3062,1276.856319,3909734.0,7,aging_s7_r2


In [27]:
#

In [28]:
%%R -c conv -i adata -i condition_col -i test_groups -i group_i -i ref_group -i batch_col -i edger_methods -o design

condition <- droplevels(factor(colData(adata)[[condition_col]]))
condition <- stats::relevel(condition, ref = ref_group)

# use batch covariate if exists and has >1 level
use_batch <- !is.null(batch_col) && batch_col %in% colnames(colData(adata)) &&
             nlevels(droplevels(factor(colData(adata)[[batch_col]]))) > 1

design <- if (use_batch) {
  batch <- droplevels(factor(colData(adata)[[batch_col]]))
  model.matrix(~ condition + batch)
} else {
  model.matrix(~ condition)
}

message("  Design: ~ ", condition_col, if (use_batch) paste0(" + ", batch_col) else "")

rdof <- nrow(design) - qr(design)$rank
if (rdof <= 0) {
  message("  Skip: no residual df (likely too few samples or condition confounded with batch)")
  return(NULL)
}

  Design: ~ condition + slide


In [29]:
# Test library-size normalization instead of the usual volume-based normalization

In [30]:
%%R -o libsize -o vol_sum
libsize <- colSums(SummarizedExperiment::assay(adata, "X"))
vol_sum <- colData(adata)$volume_sum

In [31]:
np.corrcoef(libsize, vol_sum)[0,1]

np.float64(0.7396053761788448)

In [32]:
%%R
min_count = 2
y <- edgeR::DGEList(assay(adata, "X"), group = condition)
keep <- edgeR::filterByExpr(y, min.count = min_count, design = design)
y <- y[keep, , keep.lib.sizes = FALSE]

In [None]:
%%R
# replace library sizes with volume
vol <- colData(adata)[["volume_sum"]]
if (is.null(vol) || any(vol <= 0)) stop("Invalid volume in colData(sce), expects 'volume_sum'.")
y$samples$lib.size <- vol

In [33]:
%%R -o s # noqa: F821
y <- edgeR::calcNormFactors(y, method="TMM")
s <- y$samples

In [34]:
s  # noqa: F821

Unnamed: 0,group,lib.size,norm.factors
donor_aging_s1_r0_0-3,WT_6,1743002.0,1.0
donor_aging_s5_r1_0-3,WT_6,783658.0,1.0
donor_aging_s5_r2_0-3,WT_12,725441.0,1.0
donor_aging_s6_r0_0-3,WT_18,842409.0,1.0
donor_aging_s7_r2_0-3,WT_6,706624.0,1.0
donor_aging_s8_r0_0-3,WT_12,1047030.0,1.0
donor_aging_s8_r1_0-3,WT_3,1413525.0,1.0
donor_aging_s8_r2_0-3,WT_18,1130092.0,1.0
donor_aging_s10_r0_0-3,WT_24,787150.0,1.0
donor_aging_s10_r1_0-3,WT_3,1179334.0,1.0


In [37]:
%%R
condition <- droplevels(factor(colData(adata)[[condition_col]]))

if (is.null(condition) || nlevels(condition) < 2) {
  message(group_i, ": skip (condition invalid or <2 levels)")
  return(NULL)
}

present <- intersect(test_groups, levels(condition))

if (!(ref_group %in% levels(condition)) || length(present) == 0) {
  message(group_i, ": skip (ref_group/test_groups missing)")
  return(NULL)
}

missing <- setdiff(test_groups, present)

message(
  group_i, ": ", paste(present, collapse = "/"), " vs ", ref_group,
  if (length(missing)) paste0(" (missing:", paste(missing, collapse = ","), ")") else ""
)

ECs: WT_6/WT_12/WT_18/WT_24 vs WT_3


In [38]:
%%R
# Run one edgeR test of `test_group` against `ref_group` on a fitted model.
#
# fit          : fitted model object, passed on to glmQLFTest() / glmLRT()
# design       : design matrix; its column names are searched for the
#                "condition<level>" coefficients
# edger_method : "QL" selects edgeR::glmQLFTest, anything else edgeR::glmLRT
# test_group, ref_group : condition levels to compare
#
# Returns the complete topTags() table (n = Inf) with an added `test` column
# "<test_group>vs<ref_group>". Stops when the test coefficient is not a design column.
edgeR_run_test <- function(fit, design, edger_method, test_group, ref_group){

  coef_test   <- paste0("condition", make.names(test_group))
  coef_ref    <- paste0("condition", make.names(ref_group))
  design_cols <- colnames(design)
  has_test    <- coef_test %in% design_cols
  has_ref     <- coef_ref %in% design_cols

  # Without the test coefficient neither a coefficient test nor a contrast is possible.
  if (!has_test) {
    stop("Cannot form contrast. Needed ", coef_test,
         if (has_ref) "" else paste0(" and ", coef_ref),
         ". Available columns: ", paste(design_cols, collapse=", "))
  }

  run_test <- if (edger_method == "QL") edgeR::glmQLFTest else edgeR::glmLRT

  tt <- if (has_ref) {
    # no-intercept design: both levels have a column, test their difference
    con <- limma::makeContrasts(contrasts = sprintf("%s-%s", coef_test, coef_ref), levels = design)
    run_test(fit, contrast = con)
  } else {
    # intercept design: the reference is absorbed, test the coefficient directly
    run_test(fit, coef = coef_test)
  }

  de <- edgeR::topTags(tt, n = Inf)$table
  de$test <- paste0(test_group, "vs", ref_group)
  de
}

In [39]:
%%R
res <- list()
for (m in edger_methods) {
  o <- edgeR_fit_model(adata, edger_method=m, condition_col=condition_col, ref_group=ref_group, batch_col=batch_col)
  if (is.null(o)) next
  for (tg in present) {
    tmp <- edgeR_run_test(o$fit, o$design, m, tg, ref_group)
    if (is.null(tmp)) next
    tmp <- dplyr::mutate(as.data.frame(tmp), FC = 2^logFC, edgeR_method = m, test_group = tg, ref = ref_group)
    res[[paste(tg, m, sep="_")]] <- tmp
  }
}

  Design: ~ condition + slide


In [40]:
%%R
head(res)

$WT_6_LRT
                      logFC    logCPM           LR       PValue         FDR
Rspo1         -1.796480e+00  5.416194 1.955523e+01 9.773267e-06 0.004886634
Cdkn1a        -1.959172e+00  7.433052 1.783115e+01 2.413982e-05 0.006034954
Gata3          2.154326e+00  4.316713 1.647749e+01 4.923103e-05 0.007130481
Spp1           1.867552e+00  6.179765 1.619835e+01 5.704385e-05 0.007130481
Cx3cl1        -1.275128e+00  9.679503 1.471806e+01 1.248446e-04 0.012484458
Arpp21        -1.641594e+00  9.187297 1.418107e+01 1.660322e-04 0.013836018
Mgp            1.699510e+00  6.143109 1.381231e+01 2.020083e-04 0.014429161
Drd1          -1.820190e+00  5.677876 1.278171e+01 3.500241e-04 0.021876508
Acta2          1.827683e+00  4.902949 1.246828e+01 4.139206e-04 0.022995590
Adora2a       -1.948057e+00  6.017826 1.176045e+01 6.050245e-04 0.030251224
Lamp5         -2.010012e+00  8.140076 1.129128e+01 7.787197e-04 0.035396350
AI593442      -1.148206e+00  9.406307 8.838187e+00 2.949920e-03 0.122913344
A8

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



 2.206818e-01 6.385209e-01 9.044205e-01
Ace2           0.1356245632  3.924311 2.159708e-01 6.421273e-01 9.049154e-01
Vcl            0.1221472885  8.461525 2.124945e-01 6.448193e-01 9.049154e-01
Casp3         -0.1188212077  5.551911 2.124898e-01 6.448229e-01 9.049154e-01
Col18a1        0.1399759389  6.237030 2.108403e-01 6.461096e-01 9.049154e-01
Krt12         -0.1402625235  4.190689 2.079572e-01 6.483733e-01 9.055493e-01
Clta          -0.1040309242  9.028075 1.980195e-01 6.563243e-01 9.141006e-01
Vegfb         -0.1273643176  8.429031 1.924480e-01 6.608870e-01 9.141208e-01
Atad2          0.1250539602  5.437126 1.910295e-01 6.620611e-01 9.141208e-01
Bcl2           0.1306102936  6.364155 1.892031e-01 6.635807e-01 9.141208e-01
Ccl4           0.1278565530  3.514245 1.870811e-01 6.653572e-01 9.141208e-01
Tagln          0.1585161933  4.785292 1.869350e-01 6.654800e-01 9.141208e-01
Gclm          -0.1354358308  8.017425 1.809390e-01 6.705675e-01 9.156624e-01
Fezf2         -0.1422755258  6.47422

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Slc22a2       WT_18vsWT_3 1.2494439          LRT      WT_18 WT_3
Tmem252       WT_18vsWT_3 0.7959455          LRT      WT_18 WT_3
Map1lc3b      WT_18vsWT_3 1.1879450          LRT      WT_18 WT_3
Nos2          WT_18vsWT_3 1.1622729          LRT      WT_18 WT_3
Plxnb3        WT_18vsWT_3 1.2078052          LRT      WT_18 WT_3
Hsp90aa1      WT_18vsWT_3 0.8769015          LRT      WT_18 WT_3
Epn2          WT_18vsWT_3 0.8831716          LRT      WT_18 WT_3
Ptprc         WT_18vsWT_3 1.1707960          LRT      WT_18 WT_3
Crabp2        WT_18vsWT_3 1.2217902          LRT      WT_18 WT_3
Snta1         WT_18vsWT_3 1.1815380          LRT      WT_18 WT_3
Eno2          WT_18vsWT_3 0.8642430          LRT      WT_18 WT_3
Col23a1       WT_18vsWT_3 1.1776129          LRT      WT_18 WT_3
Trim65        WT_18vsWT_3 0.8302493          LRT      WT_18 WT_3
Notch3        WT_18vsWT_3 0.8295940          LRT      WT_18 WT_3
Gad1          WT_18vsWT_3 0.8607261          LRT      WT_18 WT_3
Reln          WT_18vsWT_

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [41]:
%%R -o out # noqa: F821
if (!length(res)) return(NULL)
out <- dplyr::bind_rows(res, .id="result_id")
out$gene <- sub("[.][.][.].*", "", rownames(out)); rownames(out) <- NULL
out <- dplyr::select(out, gene, FC, dplyr::everything())

In [42]:
out  # noqa: F821

Unnamed: 0,gene,FC,result_id,logFC,logCPM,LR,PValue,FDR,test,edgeR_method,test_group,ref
1,Rspo1,0.287876,WT_6_LRT,-1.796480,5.416194,19.555233,0.000010,0.004887,WT_6vsWT_3,LRT,WT_6,WT_3
2,Cdkn1a,0.257176,WT_6_LRT,-1.959172,7.433052,17.831155,0.000024,0.006035,WT_6vsWT_3,LRT,WT_6,WT_3
3,Gata3,4.451607,WT_6_LRT,2.154326,4.316713,16.477494,0.000049,0.007130,WT_6vsWT_3,LRT,WT_6,WT_3
4,Spp1,3.649128,WT_6_LRT,1.867552,6.179765,16.198348,0.000057,0.007130,WT_6vsWT_3,LRT,WT_6,WT_3
5,Cx3cl1,0.413188,WT_6_LRT,-1.275128,9.679503,14.718062,0.000125,0.012484,WT_6vsWT_3,LRT,WT_6,WT_3
...,...,...,...,...,...,...,...,...,...,...,...,...
1996,Pdgfc,0.998429,WT_24_LRT,-0.002268,5.428233,0.000120,0.991263,0.997057,WT_24vsWT_3,LRT,WT_24,WT_3
1997,Cfp,0.998099,WT_24_LRT,-0.002745,4.526584,0.000113,0.991505,0.997057,WT_24vsWT_3,LRT,WT_24,WT_3
1998,Enpp6,1.001537,WT_24_LRT,0.002216,4.665809,0.000068,0.993435,0.997057,WT_24vsWT_3,LRT,WT_24,WT_3
1999,Pifo,1.001079,WT_24_LRT,0.001556,3.364486,0.000038,0.995063,0.997057,WT_24vsWT_3,LRT,WT_24,WT_3


In [43]:
group_i

'ECs'

In [44]:
all_degs = {}
if out is not ro.NULL:
    all_degs[group_i] = out

In [56]:
# end