In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
import os
import scipy.io as sio

sc.settings.verbosity = 1 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=100, fontsize=10, dpi_save=300, figsize=(5,4), format='png')

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

In [62]:
samples = ['iGABA_post','iGABA_pre','iGlut_post_p1','iGlut_post_p2','iGlut_pre']

In [64]:
def QC_plots(adata,sample):
    nCountsPerGene = np.sum(adata.X, axis=0)
    nCellsPerGene = np.sum(adata.X>0, axis=0)

    # Show info
    print("Number of counts (in the dataset units) per gene:", nCountsPerGene.min(), " - " ,nCountsPerGene.max())
    print("Number of cells in which each gene is detected:", nCellsPerGene.min(), " - " ,nCellsPerGene.max())

    nCells=adata.X.shape[0]

    # pySCENIC thresholds
    minCountsPerGene=3*.01*nCells # 3 counts in 1% of cells
    print("minCountsPerGene: ", minCountsPerGene)

    minSamples=.01*nCells # 1% of cells
    print("minSamples: ", minSamples)

    # simply compute the number of genes per cell (computers 'n_genes' column)
    sc.pp.filter_cells(adata, min_genes=0)
    # mito and genes/counts cuts
    mito_genes = adata.var_names.str.startswith('MT-')
    # for each cell compute fraction of counts in mito genes vs. all genes
    adata.obs['percent_mito'] = np.sum(
        adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
    # add the total counts per cell as observations-annotation to adata
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4), dpi=150, sharey=True)

    x = adata.obs['n_genes']
    x_lowerbound = 1500
    x_upperbound = 2000
    nbins=100

    sns.distplot(x, ax=ax1, norm_hist=True, bins=nbins)
    sns.distplot(x, ax=ax2, norm_hist=True, bins=nbins)
    sns.distplot(x, ax=ax3, norm_hist=True, bins=nbins)

    ax2.set_xlim(0,x_lowerbound)
    ax3.set_xlim(x_upperbound, adata.obs['n_genes'].max() )

    for ax in (ax1,ax2,ax3): 
        ax.set_xlabel('')

    ax1.title.set_text('n_genes')
    ax2.title.set_text('n_genes, lower bound')
    ax3.title.set_text('n_genes, upper bound')

    fig.text(-0.01, 0.5, 'Frequency', ha='center', va='center', rotation='vertical', size='x-large')
    fig.text(0.5, 0.0, 'Genes expressed per cell', ha='center', va='center', size='x-large')

    fig.tight_layout()
    fig.savefig(sample+'__filtering_panel_genes.pdf', dpi=600, bbox_inches='tight')


    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4), dpi=150, sharey=True)

    x = adata.obs['percent_mito']
    x_lowerbound = [0.0, 0.07 ]
    x_upperbound = [ 0.10, 0.3 ]
    nbins=100

    sns.distplot(x, ax=ax1, norm_hist=True, bins=nbins)
    sns.distplot(x, ax=ax2, norm_hist=True, bins=int(nbins/(x_lowerbound[1]-x_lowerbound[0])) )
    sns.distplot(x, ax=ax3, norm_hist=True, bins=int(nbins/(x_upperbound[1]-x_upperbound[0])) )

    ax2.set_xlim(x_lowerbound[0], x_lowerbound[1])
    ax3.set_xlim(x_upperbound[0], x_upperbound[1] )
    for ax in (ax1,ax2,ax3): 
        ax.set_xlabel('')

    ax1.title.set_text('percent_mito')
    ax2.title.set_text('percent_mito, lower bound')
    ax3.title.set_text('percent_mito, upper bound')

    fig.text(-0.01, 0.5, 'Frequency', ha='center', va='center', rotation='vertical', size='x-large')
    fig.text(0.5, 0.0, 'Mitochondrial read fraction per cell', ha='center', va='center', size='x-large')

    fig.tight_layout()
    fig.savefig(sample+'__filtering_panel_mitochondria.pdf', dpi=600, bbox_inches='tight')


    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4), dpi=150, sharey=False)

    sns.distplot( adata.obs['n_genes'], ax=ax1, norm_hist=True, bins=100)
    sns.distplot( adata.obs['n_counts'], ax=ax2, norm_hist=True, bins=100)
    sns.distplot( adata.obs['percent_mito'], ax=ax3, norm_hist=True, bins=100)

    ax1.title.set_text('Number of genes expressed per cell')
    ax2.title.set_text('Counts per cell')
    ax3.title.set_text('Mitochondrial read fraction per cell')

    fig.text(-0.01, 0.5, 'Frequency', ha='center', va='center', rotation='vertical', size='x-large')

    fig.tight_layout()

    fig.savefig(sample+'__filtering_panel_prefilter.pdf', dpi=600, bbox_inches='tight')

    sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'],
        jitter=0.4, multi_panel=True,show=False)
    plt.savefig(sample+'__filtering_panel_prefilter_violin.pdf', dpi=600, bbox_inches='tight')

    sc.pl.scatter(adata, x='n_counts', y='n_genes', color='percent_mito',show=False)
    plt.savefig(sample+'__filtering_panel_prefilter_scatter.pdf', dpi=600, bbox_inches='tight')

    return(adata)

In [66]:
def filter_adata(adata,mito_pct=1,n_genes_filter_min=750,n_counts_filter_min=1000):
    from scipy.stats import median_abs_deviation

    def is_outlier(adata, metric: str, nmads: int):
        M = adata.obs[metric]
        outlier = (M < np.median(M) - nmads * median_abs_deviation(M)) | (
            np.median(M) + nmads * median_abs_deviation(M) < M
        )
        return outlier


    adata.obs["outlier"] = (
        is_outlier(adata, "log1p_total_counts", 5)
        | is_outlier(adata, "log1p_n_genes_by_counts", 5)
        | is_outlier(adata, "pct_counts_in_top_20_genes", 5)
    )
    adata.obs.outlier.value_counts()

    adata.obs["mt_outlier"] = is_outlier(adata, "pct_counts_mt", 3) | (
        adata.obs["pct_counts_mt"] > mito_pct
    )
    adata.obs.mt_outlier.value_counts()


    sc.pp.filter_cells(adata, min_genes=n_genes_filter_min)
    sc.pp.filter_cells(adata, min_counts=n_counts_filter_min)
    
    return(adata)


In [67]:
for sample in samples:
    print(sample)
    sample_path = 'adata/'+sample+'_raw.h5ad'
    sample_path_filtered = 'adata/'+sample+'_filtered.h5ad'
    sample_path_filtered_doublets = 'adata/'+sample+'_filtered_doublets.h5ad'
    sample_path_filtered_final = 'adata/'+sample+'_filtered_final.h5ad'

    adata = sc.read_h5ad(sample_path)

    # mitochondrial genes
    adata.var["mt"] = adata.var_names.str.startswith("MT-")
    # ribosomal genes
    adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))

    sc.pp.calculate_qc_metrics(
        adata, qc_vars=["mt", "ribo"], inplace=True, percent_top=[20], log1p=True
    )

    a = QC_plots(adata,sample)

    print(f"Total number of cells: {adata.n_obs}")
    if 'iGlut_post' in sample:
        mito_pct = 5
    if 'iGlut_pre' in sample:
        mito_pct = 6
    if 'iGABA_post' in sample:
        mito_pct = 6
    if 'iGABA_pre' in sample:
        mito_pct = 8

    print(mito_pct)
    adata = filter_adata(adata,mito_pct=mito_pct)
    adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()

    print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")

    adata.write_h5ad(sample_path_filtered)
    
    
    sc.external.pp.scrublet(adata) #estimates doublets
    adata.write_h5ad(sample_path_filtered_doublets)

    adata = adata[adata.obs['predicted_doublet'] == False] #do the actual filtering
    adata.write_h5ad(sample_path_filtered_final)

iGABA_post
Number of counts (in the dataset units) per gene: 0.0  -  20297806.0
Number of cells in which each gene is detected: 0  -  96422
minCountsPerGene:  2892.66
minSamples:  964.22



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(x, ax=ax1, norm_hist=True, bins=nbins)

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(x, ax=ax2, norm_hist=True, bins=nbins)

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with

Total number of cells: 96422
6
Number of cells after filtering of low quality cells: 85756


  view_to_actual(adata)


Automatically set threshold at doublet score = 0.76
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 5.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.0%
iGABA_pre
Number of counts (in the dataset units) per gene: 0.0  -  35597720.0
Number of cells in which each gene is detected: 0  -  149491
minCountsPerGene:  4484.73
minSamples:  1494.91



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(x, ax=ax1, norm_hist=True, bins=nbins)

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(x, ax=ax2, norm_hist=True, bins=nbins)

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with

Total number of cells: 149491
8
Number of cells after filtering of low quality cells: 140278


  view_to_actual(adata)


Automatically set threshold at doublet score = 0.81
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 2.7%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.0%
iGlut_post_p1
Number of counts (in the dataset units) per gene: 0.0  -  16150212.0
Number of cells in which each gene is detected: 0  -  125113
minCountsPerGene:  3753.48
minSamples:  1251.16



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(x, ax=ax1, norm_hist=True, bins=nbins)

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(x, ax=ax2, norm_hist=True, bins=nbins)

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with

Total number of cells: 125116
5
Number of cells after filtering of low quality cells: 101395


  view_to_actual(adata)


Automatically set threshold at doublet score = 0.73
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 6.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.0%
iGlut_post_p2
Number of counts (in the dataset units) per gene: 0.0  -  16508921.0
Number of cells in which each gene is detected: 0  -  94250
minCountsPerGene:  2827.5
minSamples:  942.5



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(x, ax=ax1, norm_hist=True, bins=nbins)

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(x, ax=ax2, norm_hist=True, bins=nbins)

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with

Total number of cells: 94250
5
Number of cells after filtering of low quality cells: 83036


  view_to_actual(adata)


Automatically set threshold at doublet score = 0.76
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 6.9%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.0%
iGlut_pre
Number of counts (in the dataset units) per gene: 0.0  -  20297806.0
Number of cells in which each gene is detected: 0  -  96422
minCountsPerGene:  2892.66
minSamples:  964.22



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(x, ax=ax1, norm_hist=True, bins=nbins)

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(x, ax=ax2, norm_hist=True, bins=nbins)

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with

Total number of cells: 96422
6
Number of cells after filtering of low quality cells: 85756


  view_to_actual(adata)


Automatically set threshold at doublet score = 0.76
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 5.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.0%
