# Data Preprocessing

This notebook reads the pancreas scRNA-seq data, splits it by embryonic day (E14.5 and E15.5), and removes annotations that are not needed downstream.

**Requires:**
- `/vol/storage/data/pancreas_sc/raw/pancreas_2019.h5ad`

**Output:**
- `/vol/storage/data/pancreas_sc/processed/gex_e14.5.h5ad`
- `/vol/storage/data/pancreas_sc/processed/gex_e15.5.h5ad`

## Library imports

In [1]:
import sys

import scanpy as sc

sys.path.append("../../")
from paths import DATA_DIR, FIG_DIR, PROJECT_DIR  # isort: skip  # noqa: E402,F401

## General settings

In [2]:
# Coarse cell-type labels retained in the processed datasets; the processing
# cells below drop every cell whose `celltype` is not listed here.
CELLTYPES_TO_KEEP = [
    "Alpha",
    "Beta",
    "Delta",
    "Ductal",
    "Epsilon",
    "Pre-endocrine",
    "Ngn3 high EP",
    "Ngn3 low EP",
]

# Folder holding the raw input file and folder receiving the per-day outputs.
SC_RAW_DIR = PROJECT_DIR / "pancreas_sc" / "raw"
SC_PROCESSED_DIR = PROJECT_DIR / "pancreas_sc" / "processed"

# Create the output folder if it does not exist yet. `parents=True` also creates
# missing intermediate folders instead of raising FileNotFoundError.
SC_PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

## Data loading

In [3]:
# Load the full scRNA-seq dataset from the raw-data folder and display its summary.
raw_h5ad_path = SC_RAW_DIR / "pancreas_2019.h5ad"
adata_rna = sc.read(raw_h5ad_path)
adata_rna

AnnData object with n_obs × n_vars = 20519 × 16206
    obs: 'day', 'n_counts', 'log_counts', 'n_genes', 'mt_frac', 'proliferation', 'G2M_score', 'S_score', 'clusters_fig3_final', 'clusters_fig3_final_noep', 'clusters_fig4_final', 'clusters_fig2_final', 'clusters_fig6_broad_final', 'clusters_fig6_fine_final', 'clusters_fig6_alpha_final', 'celltype'
    var: 'n_cells', 'highly_variable_genes', 'expression_mean', 'dispersion'
    uns: 'clusters_fig6_fine_final_colors'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'spliced', 'unspliced'
    obsp: 'connectivities', 'distances'

## Data processing and saving


### E14.5

In [4]:
# Select the E14.5 cells. The explicit copy ensures the `.obs` edits below act on
# a standalone AnnData object instead of a view of `adata_rna`.
adata = adata_rna[adata_rna.obs["day"].isin(["14.5"]), :].copy()

# Keep only the day and cell-type annotations, relabel the day, and merge the
# closely related fine-grained labels.
# NOTE(review): if these columns are categorical, recent pandas versions deprecate
# `replace` on them - confirm against the pandas version in use.
adata.obs = adata.obs.loc[:, ["day", "celltype"]].replace(
    {
        "14.5": "E14.5",
        "Ngn3 High early": "Ngn3 high EP",
        "Ngn3 High late": "Ngn3 high EP",
        "Fev+ Pyy": "Eps/Delta progenitors",
        "Fev+ Epsilon": "Eps/Delta progenitors",
    }
)

# Preserve the fine-grained labels before collapsing them into the coarse ones.
adata.obs["celltype_fine"] = adata.obs["celltype"].copy()

# Assign the result back rather than calling `replace(..., inplace=True)` on the
# selected column: with pandas copy-on-write that pattern modifies a temporary
# object and leaves `adata.obs` unchanged.
adata.obs["celltype"] = adata.obs["celltype"].replace(
    {
        "Fev+ Alpha": "Pre-endocrine",
        "Fev+ Beta": "Pre-endocrine",
        "Fev+ Delta": "Pre-endocrine",
        "Eps/Delta progenitors": "Pre-endocrine",
    }
)
adata.obs["protocol"] = "scRNA-seq"
adata.obs = adata.obs.astype({"celltype": "category", "celltype_fine": "category"})

# Drop every cell whose coarse label is not in CELLTYPES_TO_KEEP.
adata = adata[adata.obs["celltype"].isin(CELLTYPES_TO_KEEP), :].copy()

# Remove all gene-level annotation columns (the gene index itself is kept).
adata.var = adata.var.loc[:, []]

# Replace `.uns` with the colour lookups (label -> hex colour) for both label sets.
adata.uns = {
    "celltype_colors": {
        "Alpha": "#1f78b4",
        "Beta": "#b2df8a",
        "Delta": "#6a3d9a",
        "Ductal": "#8fbc8f",
        "Epsilon": "#cab2d6",
        "Ngn3 high EP": "#fdbf6f",
        "Ngn3 low EP": "#f4a460",
        "Pre-endocrine": "#ff7f00",
    },
    "celltype_fine_colors": {
        "Alpha": "#1f78b4",
        "Beta": "#b2df8a",
        "Delta": "#6a3d9a",
        "Ductal": "#8fbc8f",
        "Eps/Delta progenitors": "#029e73",
        "Epsilon": "#cab2d6",
        "Ngn3 high EP": "#fdbf6f",
        "Ngn3 low EP": "#f4a460",
        "Fev+ Alpha": "#d55e00",
        "Fev+ Beta": "#cc78bc",
        "Fev+ Delta": "#ca9161",
    },
}

# Discard embeddings, neighbour graphs and gene loadings from the full dataset.
adata.obsm = {}
adata.obsp = {}
adata.varm = {}

# Cut each cell name at its first "-" and append a day/dataset suffix.
adata.obs_names = adata.obs_names.str.replace("(-).*", "", regex=True) + "-e14.5-v2019"

adata

AnnData object with n_obs × n_vars = 5934 × 16206
    obs: 'day', 'celltype', 'celltype_fine', 'protocol'
    uns: 'celltype_colors', 'celltype_fine_colors'
    layers: 'spliced', 'unspliced'

In [5]:
# Save the processed E14.5 subset to the processed-data folder.
e14_output_path = SC_PROCESSED_DIR / "gex_e14.5.h5ad"
adata.write(e14_output_path)

### E15.5

In [6]:
# Select the E15.5 cells. The explicit copy ensures the `.obs` edits below act on
# a standalone AnnData object instead of a view of `adata_rna`.
adata = adata_rna[adata_rna.obs["day"].isin(["15.5"]), :].copy()

# Keep only the day and cell-type annotations, relabel the day, and merge the
# closely related fine-grained labels.
# NOTE(review): if these columns are categorical, recent pandas versions deprecate
# `replace` on them - confirm against the pandas version in use.
adata.obs = adata.obs.loc[:, ["day", "celltype"]].replace(
    {
        "15.5": "E15.5",
        "Ngn3 High early": "Ngn3 high EP",
        "Ngn3 High late": "Ngn3 high EP",
        "Fev+ Pyy": "Eps/Delta progenitors",
        "Fev+ Epsilon": "Eps/Delta progenitors",
    }
)

# Preserve the fine-grained labels before collapsing them into the coarse ones.
adata.obs["celltype_fine"] = adata.obs["celltype"].copy()

# Assign the result back rather than calling `replace(..., inplace=True)` on the
# selected column: with pandas copy-on-write that pattern modifies a temporary
# object and leaves `adata.obs` unchanged.
adata.obs["celltype"] = adata.obs["celltype"].replace(
    {
        "Fev+ Alpha": "Pre-endocrine",
        "Fev+ Beta": "Pre-endocrine",
        "Fev+ Delta": "Pre-endocrine",
        "Eps/Delta progenitors": "Pre-endocrine",
    }
)
adata.obs["protocol"] = "scRNA-seq"
adata.obs = adata.obs.astype({"celltype": "category", "celltype_fine": "category"})

# Drop every cell whose coarse label is not in CELLTYPES_TO_KEEP.
adata = adata[adata.obs["celltype"].isin(CELLTYPES_TO_KEEP), :].copy()

# Remove all gene-level annotation columns (the gene index itself is kept).
adata.var = adata.var.loc[:, []]

# Replace `.uns` with the colour lookups (label -> hex colour) for both label sets.
adata.uns = {
    "celltype_colors": {
        "Alpha": "#1f78b4",
        "Beta": "#b2df8a",
        "Delta": "#6a3d9a",
        "Ductal": "#8fbc8f",
        "Epsilon": "#cab2d6",
        "Ngn3 high EP": "#fdbf6f",
        "Ngn3 low EP": "#f4a460",
        "Pre-endocrine": "#ff7f00",
    },
    "celltype_fine_colors": {
        "Alpha": "#1f78b4",
        "Beta": "#b2df8a",
        "Delta": "#6a3d9a",
        "Ductal": "#8fbc8f",
        "Eps/Delta progenitors": "#029e73",
        "Epsilon": "#cab2d6",
        "Ngn3 high EP": "#fdbf6f",
        "Ngn3 low EP": "#f4a460",
        "Fev+ Alpha": "#d55e00",
        "Fev+ Beta": "#cc78bc",
        "Fev+ Delta": "#ca9161",
    },
}

# Discard embeddings, neighbour graphs and gene loadings from the full dataset.
adata.obsm = {}
adata.obsp = {}
adata.varm = {}

# Cut each cell name at its first "-" and append a day/dataset suffix.
adata.obs_names = adata.obs_names.str.replace("(-).*", "", regex=True) + "-e15.5-v2019"

adata

AnnData object with n_obs × n_vars = 3696 × 16206
    obs: 'day', 'celltype', 'celltype_fine', 'protocol'
    uns: 'celltype_colors', 'celltype_fine_colors'
    layers: 'spliced', 'unspliced'

In [7]:
# Save the processed E15.5 subset to the processed-data folder.
e15_output_path = SC_PROCESSED_DIR / "gex_e15.5.h5ad"
adata.write(e15_output_path)