In [None]:
import scanpy as sc
import scipy.sparse as sp
import pandas as pd
import os
import numpy as np
import scvi
import seaborn as sns
from scipy.stats import median_abs_deviation

In [None]:
# Read the gzipped count CSV into an AnnData object and transpose it, then
# display the per-observation table.
# NOTE(review): the transpose presumably converts a genes-x-cells file into
# the cells-x-genes layout used by the rest of the notebook — confirm against
# the file's actual orientation.
adata = sc.read_csv('../database/GSM4445608_NB01.count.csv.gz').T
adata.obs

In [None]:
# Per-gene boolean flag: True for every gene whose name starts with 'MT-'.
# The flag is consumed later as a qc_vars entry when QC metrics are computed.
adata.var['mt'] = adata.var_names.str.startswith('MT-')
adata.var

In [None]:
# Plain-text export of the KEGG_RIBOSOME gene set, as named in the query string.
ribo_url = "http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=KEGG_RIBOSOME&fileType=txt"

# Download and parse the gene list: the first two lines are skipped and no
# header row is assumed, so the first column is addressed as column 0.
# NOTE(review): the two skipped lines are presumably the gene-set name and a
# source/description line of the .txt export — confirm against the file.
# NOTE(review): this is fetched over the network on every run; consider
# caching a local copy if reproducibility matters.
ribo_genes = pd.read_table(ribo_url, skiprows=2, header = None)
ribo_genes

In [None]:
# Per-gene boolean flag: True for every gene name that appears in column 0 of
# the downloaded ribosomal gene list.
adata.var['ribo'] = adata.var_names.isin(ribo_genes[0].values)
adata.var

In [None]:
# Compute QC metrics and store them on adata (inplace=True), with additional
# summaries for the boolean 'mt' and 'ribo' gene flags. Later cells read
# n_genes_by_counts, total_counts, pct_counts_mt and pct_counts_ribo from
# adata.obs and n_cells_by_counts from adata.var.
# percent_top=None and log1p=False are passed to leave out the optional
# top-gene-percentage and log-transformed columns.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'ribo'], inplace=True, percent_top=None, log1p=False)

In [None]:
# Inspect the gene table ordered by n_cells_by_counts (ascending by default),
# so the least frequently detected genes are listed first.
adata.var.sort_values('n_cells_by_counts')

In [None]:
# Apply the min_cells=3 gene filter to adata (the result is not reassigned,
# so the object itself is updated), then show the cell table ordered by
# n_genes_by_counts (ascending).
sc.pp.filter_genes(adata, min_cells=3)
adata.obs.sort_values('n_genes_by_counts')

In [None]:
# Violin plots of the four per-cell QC metrics, one panel per metric
# (multi_panel=True), with jittered points overlaid (jitter=0.4). Used to
# eyeball the distributions before choosing the filtering cut-offs below.
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo'], 
             jitter=0.4, multi_panel=True)

In [None]:
# 98th percentile of the number of genes detected per cell; serves as the
# upper cut-off for the gene-count filter in the next cell.
upper_lim = np.quantile(adata.obs['n_genes_by_counts'].to_numpy(), 0.98)
upper_lim

In [None]:
# Keep only cells whose detected-gene count is strictly below upper_lim
# (the 98th percentile computed above).
# Boolean indexing of an AnnData returns a view onto the parent object;
# .copy() materialises the subset so later cells operate on an independent
# object instead of stacking views on the unfiltered data.
adata = adata[adata.obs.n_genes_by_counts < upper_lim].copy()
adata

In [None]:
# Keep only cells with less than 20% of their counts in 'mt'-flagged genes.
# .copy() turns the indexed view into a standalone AnnData: a later cell
# assigns a new column into adata.obs, and writing into a view forces an
# implicit copy accompanied by an ImplicitModificationWarning.
adata = adata[adata.obs.pct_counts_mt < 20].copy()
adata

In [None]:
def is_outlier(adata, metric: str, nmads: int):
    """Flag observations whose ``metric`` lies far from the median.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` table that can be indexed by ``metric``.
    metric
        Name of the ``adata.obs`` column to test.
    nmads
        Number of median absolute deviations (MADs) tolerated on either side
        of the median.

    Returns
    -------
    Boolean mask aligned with ``adata.obs[metric]``: True where the value is
    strictly below ``median - nmads * MAD`` or strictly above
    ``median + nmads * MAD``.
    """
    values = adata.obs[metric]
    center = np.median(values)
    tolerance = nmads * median_abs_deviation(values)
    too_low = values < center - tolerance
    too_high = values > center + tolerance
    return too_low | too_high

In [None]:
# Mark cells whose ribosomal-count percentage is more than 5 MADs away from
# the median, then tabulate how many cells were flagged.
adata.obs["outlier"] = is_outlier(adata, "pct_counts_ribo", 5)
adata.obs["outlier"].value_counts()

In [None]:
# Drop the cells flagged in adata.obs["outlier"].
# .copy() materialises the subset: the following cells normalise and
# log-transform adata in place, and handing them a view would trigger an
# implicit copy (with a warning) instead of an explicit one here.
adata = adata[~adata.obs["outlier"]].copy()
adata

In [None]:
# Library-size normalisation, applied to adata itself (result not reassigned).
sc.pp.normalize_total(adata, target_sum=1e4)  # normalize every cell to 10,000 UMI


In [None]:
# Sanity check: per-cell (row) sums of the expression matrix.
# NOTE(review): after the normalisation above these are expected to be
# ~10,000 for every cell — confirm in the output.
adata.X.sum(axis = 1)

In [None]:
# Log-transform the normalised counts (log1p), applied to adata itself.
sc.pp.log1p(adata)  # change to log counts

In [None]:
# Keep the current state (normalised, log-transformed, all remaining genes)
# on adata.raw before the gene subsetting, regression and scaling below.
# NOTE(review): consider `adata.raw = adata.copy()` if the stored snapshot
# must be fully decoupled from later in-place edits — confirm against the
# anndata version in use.
adata.raw = adata

In [None]:
# Annotate the 2000 most variable genes; the resulting boolean column
# adata.var.highly_variable is used for subsetting further down.
sc.pp.highly_variable_genes(adata, n_top_genes = 2000)

In [None]:
# Diagnostic plot of the highly-variable-gene selection made above.
sc.pl.highly_variable_genes(adata)

In [None]:
# Restrict the matrix to the genes flagged as highly variable.
# .copy() makes the subset a standalone AnnData: regress_out and scale below
# overwrite the expression matrix in place, and doing that on a view forces
# an implicit copy with an ImplicitModificationWarning.
adata = adata[:, adata.var.highly_variable].copy()
# Regress the three per-cell QC covariates out of the expression values.
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt', 'pct_counts_ribo'])
# Scale each gene, clipping values at max_value=10.
sc.pp.scale(adata, max_value=10)
# PCA with the ARPACK solver, followed by the variance-ratio plot
# (log scale, first 50 components) used to choose n_pcs for the neighbour graph.
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata, log=True, n_pcs = 50)

In [None]:
# Build the cell neighbourhood graph from the first 20 principal components.
# NOTE(review): 20 was presumably picked from the variance-ratio plot above —
# confirm.
sc.pp.neighbors(adata, n_pcs = 20)

In [None]:
# Compute the UMAP embedding and plot it without any colouring.
sc.tl.umap(adata)
sc.pl.umap(adata)

In [None]:
# Leiden clustering at resolution 0.5; the cluster labels are read back as
# the 'leiden' key in the plot below.
sc.tl.leiden(adata, resolution = 0.5)

In [None]:
# UMAP embedding coloured by the Leiden cluster assignment.
sc.pl.umap(adata, color=['leiden'])