In [96]:
import scanpy as sc

In [97]:
adata_wt = sc.read_csv("../data/GSM5359947_D20-3958_WT.csv.gz").T
adata_wt

AnnData object with n_obs × n_vars = 2735 × 31053

In [98]:
adata_ko = sc.read_csv("../data/GSM5359948_D20-3959_KO.csv.gz").T
adata_ko

AnnData object with n_obs × n_vars = 1785 × 31053

In [99]:
adata_wt.obs["treatment"] = "WT"
adata_ko.obs["treatment"] = "KO"

In [100]:
# Labeling mitochondrial genes
adata_wt.var["mt"] = adata_wt.var_names.str.startswith("mt-")
adata_ko.var["mt"] = adata_ko.var_names.str.startswith("mt-")

In [101]:
# calculating metrics
sc.pp.calculate_qc_metrics(adata_wt, qc_vars=["mt"], inplace = True, percent_top=None, log1p=False)
sc.pp.calculate_qc_metrics(adata_ko, qc_vars=["mt"], inplace = True, percent_top=None, log1p=False)

In [113]:
# visualizing mitochondrial percent
sc.set_figure_params(figsize=(4, 4))
sc.pl.violin(adata_wt, keys = ["pct_counts_mt", "n_genes_by_counts", "total_counts"], multi_panel = True)
sc.pl.violin(adata_ko, keys = ["pct_counts_mt", "n_genes_by_counts", "total_counts"], multi_panel = True)

# Preprocessing

In [103]:
# filtering out cells with less than 500 genes expressed
sc.pp.filter_cells(adata_wt, min_genes = 500)
sc.pp.filter_cells(adata_ko, min_genes = 500)

In [104]:
import doubletdetection

clf = doubletdetection.BoostClassifier(
    n_iters=25, clustering_algorithm="louvain", standard_scaling=True, pseudocount=0.1, n_jobs=-1
)

# doublet identification for wildtype data
doublets = clf.fit(adata_wt.X).predict(p_thresh=1e-7, voter_thresh=0.5)
doublet_score = clf.doublet_score()

adata_wt.obs["doublet"] = doublets

  0%|          | 0/25 [00:00<?, ?it/s]

In [105]:
# doublet identification for knockout data
doublets = clf.fit(adata_ko.X).predict(p_thresh=1e-7, voter_thresh=0.5)
doublet_score = clf.doublet_score()

adata_ko.obs["doublet"] = doublets

  0%|          | 0/25 [00:00<?, ?it/s]

In [111]:
# removing doublets from data
adata_wt = adata_wt[adata_wt.obs.doublet == 0.0]
adata_ko = adata_ko[adata_ko.obs.doublet == 0.0]

In [None]:
# removing high mitochondrial percentage


In [15]:
adata = sc.concat([adata_wt, adata_ko])

  utils.warn_names_duplicates("obs")


In [22]:
adata

AnnData object with n_obs × n_vars = 4520 × 31053
    obs: 'treatment'

4519