# Preprocess the data

1. QC
2. Remove low quality cells
3. Remove doublets
4. Call major compartments
5. Link with InferCNV results
6. Split into malignant cells

## Remove Doublets

In [None]:
# PYTHON
# Score doublets with Scrublet (through scanpy), save diagnostic plots, and
# write a copy of the primary AnnData object with predicted doublets removed.
import scanpy as sc
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Setup directories
# NOTE(review): `dataDir` and `figuresDir` are not defined in this cell; they are
# presumably set in an earlier cell -- confirm before a Restart & Run All.
ADATA_IN_PATH = os.path.join(dataDir, 'primary.h5ad')
ADATA_OUT_PATH = os.path.join(dataDir, 'primary_doublet.h5ad')
DOUBLET_HIST_PATH = os.path.join(figuresDir, 'doublet_score_hist.png')

adata = sc.read_h5ad(ADATA_IN_PATH)
# Heuristic guard: the largest value in X should look like a raw count.
assert adata.X.max() > 100, "Seems like the data is not raw counts..."
# Keep a snapshot of the unmodified object in .raw before scoring.
adata.raw = adata.copy()
# Doublet detection is run per batch, with batches given by the "sample" column.
sc.pp.scrublet(adata, batch_key="sample")
# NOTE(review): this plot needs a UMAP embedding on `adata`; none is computed in
# this cell, so it is presumably stored in the input file -- confirm.
sc.pl.umap(adata, color=["sample", 'group', "doublet_score", "predicted_doublet"], save="_scrublet.png", show=False, cmap='viridis', ncols=1)

# Plot histogram of the doublet_scores on its own explicit figure
fig, ax = plt.subplots()
sns.histplot(adata.obs['doublet_score'], bins=100, ax=ax)
ax.set(title='Scrublet doublet scores', xlabel='doublet_score', ylabel='Number of cells')
fig.savefig(DOUBLET_HIST_PATH, dpi=300, bbox_inches='tight')
plt.close(fig)

# Drop doublet: keep only cells not flagged as predicted doublets
adata = adata[~adata.obs['predicted_doublet']].copy()
adata.write(ADATA_OUT_PATH)

## Double-checking doublets using scDblFinder in R

We found similar results with scDblFinder, so we will keep the Scrublet results.

In [None]:
## R
# Cross-check the doublet calls with scDblFinder: build a SingleCellExperiment
# from an exported counts matrix, run scDblFinder twice (default settings and
# dbr = 0.1), and write one CSV of per-cell scores/classes per run.
library(scDblFinder)
library(SingleCellExperiment)  # provides the SingleCellExperiment() constructor used below
library(Matrix)
library(dplyr)
library(tibble)

setwd("/data1/shahs3/users/salehis/sclc/results/patient_met/primary_umaps/data")

# scDblFinder is stochastic (it simulates artificial doublets), so fix the seed
# before every run to make the CSVs reproducible.
SEED <- 42

# Read the counts matrix (Matrix Market format)

# Should be gene by cell, so transpose
counts <- readMM("counts.mtx")
counts <- t(counts)
# Read the cell and gene names
cell_names <- readLines("Ru1449A_cells.txt")
gene_names <- readLines("Ru1449A_genes.txt")

# Fail early if the name files do not match the (transposed) matrix.
stopifnot(
  nrow(counts) == length(gene_names),
  ncol(counts) == length(cell_names)
)

# Create the sce object
sce <- SingleCellExperiment(list(counts=counts), colData=DataFrame(label=cell_names), rowData=DataFrame(gene_id=gene_names))

# Run scDblFinder on `sce`, write a table of cell_name / score / class sorted by
# decreasing score to `out_csv`, and return the annotated object.
# Extra arguments in `...` are forwarded to scDblFinder().
run_scdblfinder <- function(sce, out_csv, ...) {
  set.seed(SEED)
  sce <- scDblFinder(sce, ...)
  res <- data.frame(cell_name=sce$label, score=sce$scDblFinder.score, class=sce$scDblFinder.class) %>%
    as_tibble() %>%
    arrange(desc(score))
  write.csv(res, out_csv, row.names=FALSE)
  sce
}

# Run the scDblFinder with default settings and save the table as a csv
sce <- run_scdblfinder(sce, "dblFinder_default.csv")

# Examine the results (summary instead of printing every per-cell score)
summary(sce$scDblFinder.score)
sce$scDblFinder.class %>% table()

# Rerun with a fixed expected doublet rate of 10% (dbr = 0.1) instead of the
# default, and save this table as a csv too
sce <- run_scdblfinder(sce, "dblFinder_dbr_0.1.csv", dbr=0.1)