In [1]:
import os

import scanpy as sc

from scripts import constants

In [2]:
# Dataset identifier: used below both as the sub-directory the AnnData file is
# read from and as the key into constants.DATASET_SPLIT_LOOKUP.
dataset = "mcfarland_2020"

In [3]:
# Build the path to the preprocessed AnnData file for the selected dataset.
adata_path = os.path.join(
    constants.DEFAULT_DATA_PATH, dataset, "preprocessed", "adata_top_2000_genes_tc.h5ad"
)
# NOTE: reading this file emits a warning that observation names are not
# unique. The cells below only select rows with boolean masks built from
# `.obs` columns, so the duplicated names are not used for lookups here.
adata = sc.read(adata_path)

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [4]:
# Fetch this dataset's split configuration once, then unpack its entries:
#   split_key        -> name of the `.obs` column used to split the cells
#   background_value -> value of that column marking background cells
#   label_key        -> read from the same config (not used in the cells shown here)
split_config = constants.DATASET_SPLIT_LOOKUP[dataset]
split_key, background_value, label_key = (
    split_config[field] for field in ("split_key", "background_value", "label_key")
)

In [5]:
# Boolean mask over cells: True where the split column equals the background value.
is_background = adata.obs[split_key] == background_value

# Background cells match the mask; every other cell is treated as target.
background_adata = adata[is_background]
target_adata = adata[~is_background]

In [6]:
# Report the size of each subset, one per line.
print(
    f"Num target samples: {target_adata.n_obs}",
    f"Num background samples: {background_adata.n_obs}",
    sep="\n",
)

Num target samples: 3097
Num background samples: 2831


In [7]:
# Number of background cells per cell line, as a two-column frame
# (`cell_line`, `background_count`) ordered by cell line.
#
# `.size()` counts rows per group. The previous `.count()` counted non-null
# values in every `.obs` column and then kept only the `split_key` column, so
# the result depended on that column having no missing values (and on
# `split_key` not being "cell_line" itself).
background_by_cell_line = (
    background_adata.obs
    .groupby("cell_line")
    .size()
    .reset_index(name="background_count")
    .sort_values(by="cell_line")
)

In [8]:
# Number of target cells per cell line, as a two-column frame
# (`cell_line`, `target_count`) ordered by cell line.
#
# `.size()` counts rows per group. Target cells are selected with
# `split_key != background_value`, which is also True for rows whose split
# value is missing; the previous `.count()` on the `split_key` column would
# have left such rows out of the per-cell-line totals even though they are
# part of `target_adata`.
target_by_cell_line = (
    target_adata.obs
    .groupby("cell_line")
    .size()
    .reset_index(name="target_count")
    .sort_values(by="cell_line")
)

In [9]:
# Put background and target counts side by side, one row per cell line.
# The join is an inner join (pandas' default, spelled out here), so a cell
# line that is absent from either table does not appear in the result.
cell_line_count = background_by_cell_line.merge(
    right=target_by_cell_line,
    how="inner",
    on="cell_line",
)

In [10]:
# Emit one LaTeX table row per cell line: name, background count, target count,
# terminated by the LaTeX row separator `\\`.
table_columns = ["cell_line", "background_count", "target_count"]
table_rows = cell_line_count[table_columns].itertuples(index=False, name=None)
for cell_line, background_count, target_count in table_rows:
    print(f"{cell_line} & {background_count} & {target_count} " r"\\")

BICR6_UPPER_AERODIGESTIVE_TRACT & 82 & 111 \\
BICR31_UPPER_AERODIGESTIVE_TRACT & 245 & 277 \\
BT474_BREAST & 53 & 71 \\
BT549_BREAST & 100 & 131 \\
CAOV3_OVARY & 97 & 140 \\
CCFSTTG1_CENTRAL_NERVOUS_SYSTEM & 103 & 77 \\
COLO680N_OESOPHAGUS & 129 & 129 \\
COV434_OVARY & 60 & 75 \\
DKMG_CENTRAL_NERVOUS_SYSTEM & 103 & 93 \\
IALM_LUNG & 105 & 141 \\
LNCAPCLONEFGC_PROSTATE & 139 & 113 \\
LS1034_LARGE_INTESTINE & 72 & 118 \\
NCIH226_LUNG & 165 & 94 \\
NCIH2347_LUNG & 111 & 159 \\
RCC10RGB_KIDNEY & 172 & 114 \\
RCM1_LARGE_INTESTINE & 109 & 133 \\
RERFLCAD1_LUNG & 99 & 123 \\
SH10TC_STOMACH & 123 & 122 \\
SKMEL2_SKIN & 150 & 141 \\
SKMEL3_SKIN & 145 & 183 \\
SNU1079_BILIARY_TRACT & 101 & 105 \\
SQ1_LUNG & 113 & 150 \\
TEN_ENDOMETRIUM & 155 & 177 \\
UMUC1_URINARY_TRACT & 100 & 120 \\
