## Marker-based metrics
- MECR (mutually-exclusive co-expression rate)

TODO:
- implement positive marker F1 score 

In [5]:
import os

import pandas as pd

from cellseg_benchmark._constants import BASE_PATH


def load_marker_gene_dict():
    """Load the per-cell-type marker genes as a ``{cell_type: [genes]}`` dict.

    The markers are read from a CSV below ``BASE_PATH`` in which every column
    is one cell type and the entries of that column are its marker genes.
    Per the original notes, the table comes from the Allen mouse brain
    scRNA-seq atlas (Yao 2023 Nature, 4M cells), computed with edgeR on
    pseudobulks using the author-derived cell annotations (separate script).

    Returns:
        dict mapping each column name to the list of its non-NaN entries, in
        file order. A column literally named ``"0"`` is treated as a stray
        index column and left out.
    """
    csv_path = os.path.join(
        BASE_PATH,
        "misc",
        "scRNAseq_ref_ABCAtlas_Yao2023Nature",
        "marker_genes_df",
        "20250211_cell_type_markers_top15_specific.csv",
    )
    marker_table = pd.read_csv(csv_path)
    # One entry per column; columns shorter than the longest one are padded
    # with NaN by read_csv, so those paddings are dropped here.
    return {
        column: marker_table[column].dropna().tolist()
        for column in marker_table.columns
        if column != "0"
    }


# Build the mapping once at module level so the cells below can reuse it.
marker_gene_dict = load_marker_gene_dict()

In [4]:
# Show the loaded mapping: cell type -> list of marker genes.
marker_gene_dict

{'Ependymal': ['Ttr',
  'Ccdc153',
  'Dnah12',
  'Ecrg4',
  'Dnah6',
  'Gm19935',
  'Kl',
  'Rassf9',
  'Foxj1',
  'Spag16',
  'Ak7',
  'Ankub1',
  'Cfap299',
  'Ttll8',
  'Spag6l'],
 'BAMs': ['Cd163',
  'Itga4',
  'Dclre1c',
  'Snx8',
  'Anapc15',
  'Rnf150',
  'Nfxl1',
  'Gab3',
  'Pdgfc',
  'Ninj1',
  'Gm30382',
  'Clcn5',
  'Wwp1',
  'B3galnt1',
  'Snx2'],
 'Microglia': ['Gm2629',
  'Cd33',
  'Pld4',
  'Hck',
  'Golm1',
  'AU020206',
  'E230029C05Rik',
  'Ebf3',
  'Zeb2os',
  'Pag1',
  'Tex14',
  'Trim30a',
  'Rnf19b',
  'Fam102b',
  'Tmem144'],
 'ECs': ['Adgrl4',
  'Cldn5',
  'Ly6a',
  'Cyyr1',
  'Abcb1a',
  'Ptprb',
  'Tek',
  'Abcc4',
  'Slc7a5',
  'Ddc',
  'Slc7a1',
  'Bsg',
  'Slc6a6',
  'Spock2',
  'Eogt'],
 'VLMCs': ['Foxd1',
  'Spp1',
  'C1qtnf7',
  'Foxq1',
  'Eln',
  'Ppara',
  'Heyl',
  'Cnn2',
  'Mrc2',
  'Kcnj8',
  'Ston1',
  'Ndnf',
  'Tmem132c',
  'B130024G19Rik',
  'Gper1'],
 'OPCs': ['Neu4',
  'C1ql1',
  'Gal3st1',
  'Dpyd',
  'Chst11',
  'Sgcd',
  '2810468N07Rik',

In [37]:
# from https://elihei2.github.io/segger_dev/api/validation/#segger.validation.compute_MECR
from typing import Dict, List, Tuple

import anndata as ad


def compute_MECR(
    adata: "ad.AnnData", gene_pairs: List[Tuple[str, str]], layer=None
) -> Dict[Tuple[str, str], float]:
    """Compute the Mutually Exclusive Co-expression Rate (MECR) for each gene pair in an AnnData object.

    For a pair ``(gene1, gene2)`` the MECR is the fraction of cells that
    express both genes among the cells that express at least one of them.
    A gene counts as expressed in a cell when its value is strictly greater
    than 0, so the matrix that is read must hold non-negative expression
    values (e.g. counts or log-normalised counts). On a centered/scaled
    matrix the ``> 0`` test no longer means "detected".

    Args:
    - adata: AnnData
        Annotated data object containing gene expression data.
    - gene_pairs: List[Tuple[str, str]]
        List of tuples representing gene pairs to evaluate. Every gene must be
        a column of ``adata.to_df(layer=layer)``.
    - layer: str or None
        Forwarded unchanged to ``adata.to_df``; with ``None`` the matrix that
        ``to_df`` returns by default is used.

    Returns:
    - mecr_dict: Dict[Tuple[str, str], float]
        Dictionary where keys are gene pairs (tuples) and values are MECR values.
        The value is 0.0 when neither gene of a pair is expressed in any cell.

    Raises:
    - KeyError
        If a gene in ``gene_pairs`` is not a column of the expression table.
    """
    mecr_dict: Dict[Tuple[str, str], float] = {}
    gene_expression = adata.to_df(layer=layer)

    # Fail early with the full list of unknown genes instead of a bare
    # KeyError on whichever pair happens to be reached first.
    requested = {gene for pair in gene_pairs for gene in pair}
    missing = sorted(requested.difference(gene_expression.columns))
    if missing:
        raise KeyError(f"genes not found in expression matrix: {missing}")

    for gene1, gene2 in gene_pairs:
        expr_gene1 = gene_expression[gene1] > 0
        expr_gene2 = gene_expression[gene2] > 0
        # Means of boolean masks are cell fractions; their ratio is the share
        # of co-expressing cells among cells expressing either gene.
        both_expressed = (expr_gene1 & expr_gene2).mean()
        at_least_one_expressed = (expr_gene1 | expr_gene2).mean()
        # Float fallback keeps the value type consistent with the other branch.
        mecr_dict[(gene1, gene2)] = (
            both_expressed / at_least_one_expressed
            if at_least_one_expressed > 0
            else 0.0
        )
    return mecr_dict

In [3]:
from pathlib import Path

import scanpy as sc

# Dataset selection for this notebook run.
cohort = "aging"
method = "Cellpose_1_DAPI_PolyT"
adata_name = "adata_integrated"  # meant to be fixed rather than a parameter
base_path = BASE_PATH

# Layout: <base_path>/analysis/<cohort>/<method>/adatas/<adata_name>.h5ad.gz
data_path = Path(base_path, "analysis", cohort, method)
adata_path = data_path.joinpath("adatas", f"{adata_name}.h5ad.gz")

adata = sc.read_h5ad(adata_path)

In [7]:
marker_gene_dict = load_marker_gene_dict()
cell_types = list(marker_gene_dict)

# Per cell type, keep only the markers that are present in adata.var_names
# (checked once per gene instead of once per candidate pair).
measured_markers = {
    cell_type: [gene for gene in marker_gene_dict[cell_type] if gene in adata.var_names]
    for cell_type in cell_types
}

# Gene-pair list for MECR: every marker of one cell type crossed with every
# marker of each later cell type, so each unordered cell-type pair is visited
# exactly once and markers of the same cell type are never paired.
gene_pairs = []
for position, first_type in enumerate(cell_types):
    for second_type in cell_types[position + 1 :]:
        gene_pairs.extend(
            (first_gene, second_gene)
            for first_gene in measured_markers[first_type]
            for second_gene in measured_markers[second_type]
        )

In [38]:
# Score every marker pair on the "volume_log1p_norm" layer.
print(f"computing MECR score on {len(gene_pairs)} gene_pairs")
results = compute_MECR(adata, gene_pairs, layer="volume_log1p_norm")

computing MECR score on 286 gene_pairs


In [36]:
# Preview the layer that was scored above; the rows shown contain only values >= 0.
adata.to_df(layer="volume_log1p_norm")

Unnamed: 0,A830036E02Rik,AI593442,Abca1,Abcb1a,Abcb1b,Abcc9,Abcg2,Acan,Ace2,Acer2,...,Vtn,Vwf,Wbp2,Wdr12,Wnt2b,Wnt7a,Xirp1,Zcchc14,Zeb1,Zic3
aaaaaaac-1,0.000000,1.566087,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,1.062619,0.000000,1.566087,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0
aaaaaaag-1,0.000000,0.000000,0.000000,0.887303,0.0,1.349927,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.887303,0.0
aaaaaaah-1,0.000000,0.000000,1.245666,1.321829,0.0,0.000000,0.601625,0.000000,0.0,0.000000,...,0.242968,0.601625,0.741990,0.000000,0.0,0.242968,0.0,0.000000,0.000000,0.0
aaaaaaai-1,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,2.971364,0.0
aaaaaaaj-1,0.976682,0.976682,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,1.786226,0.000000,0.0,0.000000,0.0,0.976682,0.976682,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
aaaalkjp-1-8,0.000000,0.000000,0.838258,0.000000,0.0,0.000000,0.000000,0.504507,0.0,0.000000,...,0.000000,0.000000,0.838258,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0
aaaalkka-1-8,0.448276,1.480164,0.448276,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.756695,0.448276,0.0,0.000000,0.0,0.000000,0.000000,0.0
aaaalkkb-1-8,0.000000,1.850332,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,1.470453,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0
aaaalkkc-1-8,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,1.669657,...,0.000000,0.000000,1.149043,0.000000,0.0,0.000000,0.0,0.000000,1.149043,0.0


In [35]:
# Preview the matrix to_df() returns by default; the rows shown include negative
# values, so a `> 0` expression test on it would not mean "detected".
# NOTE(review): presumably this matrix is centered/scaled — confirm.
adata.to_df()

Unnamed: 0,A830036E02Rik,AI593442,Abca1,Abcb1a,Abcb1b,Abcc9,Abcg2,Acan,Ace2,Acer2,...,Vtn,Vwf,Wbp2,Wdr12,Wnt2b,Wnt7a,Xirp1,Zcchc14,Zeb1,Zic3
aaaaaaac-1,-0.470443,1.096362,-0.631811,-0.390061,-0.202801,-0.189414,-0.410346,-0.253329,-0.19412,-0.339745,...,3.499279,-0.290948,1.993809,-0.536803,-0.315177,-0.339384,-0.226188,-0.757540,-1.000600,-0.290458
aaaaaaag-1,-0.470443,-1.007325,-0.631811,1.660135,-0.202801,5.696623,-0.410346,-0.253329,-0.19412,-0.339745,...,-0.205790,-0.290948,-1.153786,-0.536803,-0.315177,-0.339384,-0.226188,-0.757540,0.882898,-0.290458
aaaaaaah-1,-0.470443,-1.007325,2.382310,2.664147,-0.202801,-0.189414,1.956154,-0.253329,-0.19412,-0.339745,...,0.641374,2.497203,0.337499,-0.536803,-0.315177,0.878519,-0.226188,-0.757540,-1.000600,-0.290458
aaaaaaai-1,-0.470443,-1.007325,-0.631811,-0.390061,-0.202801,-0.189414,-0.410346,-0.253329,-0.19412,-0.339745,...,-0.205790,-0.290948,-1.153786,-0.536803,-0.315177,-0.339384,-0.226188,-0.757540,5.306779,-0.290458
aaaaaaaj-1,1.957401,0.304628,-0.631811,-0.390061,-0.202801,-0.189414,-0.410346,-0.253329,-0.19412,-0.339745,...,-0.205790,-0.290948,2.436253,-0.536803,-0.315177,-0.339384,-0.226188,1.797127,1.072624,-0.290458
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
aaaalkjp-1-8,-0.470443,-1.007325,1.396512,-0.390061,-0.202801,-0.189414,-0.410346,3.428933,-0.19412,-0.339745,...,-0.205790,-0.290948,0.530984,-0.536803,-0.315177,-0.339384,-0.226188,-0.757540,-1.000600,-0.290458
aaaalkka-1-8,0.643886,0.980943,0.452877,-0.390061,-0.202801,-0.189414,-0.410346,-0.253329,-0.19412,-0.339745,...,-0.205790,-0.290948,0.367055,1.282058,-0.315177,-0.339384,-0.226188,-0.757540,-1.000600,-0.290458
aaaalkkb-1-8,-0.470443,1.478180,-0.631811,-0.390061,-0.202801,-0.189414,-0.410346,-0.253329,-0.19412,-0.339745,...,-0.205790,-0.290948,1.801599,-0.536803,-0.315177,-0.339384,-0.226188,-0.757540,-1.000600,-0.290458
aaaalkkc-1-8,-0.470443,-1.007325,-0.631811,-0.390061,-0.202801,-0.189414,-0.410346,-0.253329,-0.19412,7.909292,...,-0.205790,-0.290948,1.155614,-0.536803,-0.315177,-0.339384,-0.226188,-0.757540,1.438499,-0.290458


In [39]:
# MECR per gene pair from the "volume_log1p_norm" run above.
results

{('Ccdc153', 'Cd163'): np.float64(0.042996499536703384),
 ('Ccdc153', 'Itga4'): np.float64(0.04649340472770014),
 ('Ccdc153', 'Pdgfc'): np.float64(0.05154596446630378),
 ('Foxj1', 'Cd163'): np.float64(0.07860431450680158),
 ('Foxj1', 'Itga4'): np.float64(0.10318897533257217),
 ('Foxj1', 'Pdgfc'): np.float64(0.12961382379892655),
 ('Ccdc153', 'Cldn5'): np.float64(0.033499063391747345),
 ('Ccdc153', 'Ly6a'): np.float64(0.03579789052842),
 ('Ccdc153', 'Abcb1a'): np.float64(0.03972710673103379),
 ('Ccdc153', 'Tek'): np.float64(0.039851628802853034),
 ('Ccdc153', 'Slc7a5'): np.float64(0.04510062784574647),
 ('Foxj1', 'Cldn5'): np.float64(0.06123534431365215),
 ('Foxj1', 'Ly6a'): np.float64(0.07545382420158912),
 ('Foxj1', 'Abcb1a'): np.float64(0.09305456448313591),
 ('Foxj1', 'Tek'): np.float64(0.0773974022384966),
 ('Foxj1', 'Slc7a5'): np.float64(0.13380918770068895),
 ('Ccdc153', 'Spp1'): np.float64(0.03967532921338219),
 ('Ccdc153', 'Foxq1'): np.float64(0.02922910489622566),
 ('Ccdc153',

In [29]:
# Column-wise maximum of the "counts" layer (one value per gene, assuming the
# usual cells x genes layout — confirm). todense() materialises the full matrix.
adata.layers["counts"].todense().max(axis=0)

matrix([[  64,  180,   28,   54,    4,   44,   50,   34,    7,   10,
           22,   75,   56,   60,    8,  122,    9,   13,    9,   72,
            4,    6,   35,   46,    4,   15,    4,   24,   93,  163,
          144,  164,   23,   47,   47,  263,   13,   51,   15,   22,
           58,   22,   12,   48,   36,   29,    8,    7,    9,   11,
           23,   37,   49,   12,   26,   24,   37,   57,   14,    9,
           12,   10,   12,   23,   16,   81,   75,   18,  230,   60,
           10,   31,   13,   11,    8,   10,  194,   19,   25,   30,
          111,   16,   53,   33,   66,   47,   61,    9,   24,  154,
           56,   32,  117,   52,   49,  144,   89,    6,  585,   21,
           20,   66,   76,   24,   49,    8,   13,   13,   16,   18,
           80,   27,   50,   15,   32,    5,  237,    8,   17,  102,
           13,  120,   98,  137,   81,   70,   15,   17,   15,   42,
          133,    6,   11,    7,    5,   28,  516,   59,  138,   87,
           12,   65,    7,  131,  

In [10]:
# NOTE(review): this output carries execution count 10, i.e. it predates the
# In [38] run, and its values differ slightly from the In [39] output —
# presumably it was computed on a different matrix; confirm or clear it.
results

{('Ccdc153', 'Cd163'): np.float64(0.042996499536703384),
 ('Ccdc153', 'Itga4'): np.float64(0.04640084814069841),
 ('Ccdc153', 'Pdgfc'): np.float64(0.0515093518320194),
 ('Foxj1', 'Cd163'): np.float64(0.07854813554515809),
 ('Foxj1', 'Itga4'): np.float64(0.10283476948904025),
 ('Foxj1', 'Pdgfc'): np.float64(0.12945526147658598),
 ('Ccdc153', 'Cldn5'): np.float64(0.03259065710956534),
 ('Ccdc153', 'Ly6a'): np.float64(0.035731835731835734),
 ('Ccdc153', 'Abcb1a'): np.float64(0.036802080202793644),
 ('Ccdc153', 'Tek'): np.float64(0.03959511508541858),
 ('Ccdc153', 'Slc7a5'): np.float64(0.04145553909843229),
 ('Foxj1', 'Cldn5'): np.float64(0.05861994122167406),
 ('Foxj1', 'Ly6a'): np.float64(0.07511342233383479),
 ('Foxj1', 'Abcb1a'): np.float64(0.08258905731534404),
 ('Foxj1', 'Tek'): np.float64(0.0766903514686306),
 ('Foxj1', 'Slc7a5'): np.float64(0.11691369800814481),
 ('Ccdc153', 'Spp1'): np.float64(0.03966534548685643),
 ('Ccdc153', 'Foxq1'): np.float64(0.02922910489622566),
 ('Ccdc153

mecr_scores = {}
for key, adata in processed_adata_dict.items():
    # Per cell type, the markers that this dataset actually contains
    # (anything missing from adata.var_names is dropped).
    cell_types = list(cell_type_dict)
    measured_markers = {
        cell_type: [gene for gene in cell_type_dict[cell_type] if gene in adata.var_names]
        for cell_type in cell_types
    }

    # Gene-pair list for MECR: markers of one cell type crossed with the
    # markers of every later cell type (each cell-type pair visited once).
    gene_pairs = []
    for position, first_type in enumerate(cell_types):
        for second_type in cell_types[position + 1 :]:
            gene_pairs.extend(
                (first_gene, second_gene)
                for first_gene in measured_markers[first_type]
                for second_gene in measured_markers[second_type]
            )

    # NOTE(review): no layer is passed, so compute_MECR reads the matrix that
    # to_df() returns by default — confirm it holds non-negative values.
    mecr_scores[key] = compute_MECR(adata, gene_pairs)
# Flatten {dataset: {(gene1, gene2): score}} into a long-format table with
# one row per (dataset, gene pair).
data = [
    {
        "dataset": dataset,
        "gene_pair": f"{gene_pair[0]}-{gene_pair[1]}",
        "MECR": float(score),
    }
    for dataset, scores in mecr_scores.items()
    for gene_pair, score in scores.items()
]
# Explicit columns keep the frame well-formed when there are no scores at all;
# without them an empty `data` yields a column-less frame and the "dataset"
# lookup below raises KeyError.
mecr_df = pd.DataFrame(data, columns=["dataset", "gene_pair", "MECR"])
# "adata_" is a literal substring, not a pattern; regex=False makes that
# independent of the pandas version's default.
mecr_df["dataset"] = mecr_df["dataset"].str.replace("adata_", "", regex=False)