In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
import os
import scipy.io as sio


In [2]:
sc.settings.verbosity = 1 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=100, fontsize=10, dpi_save=300, figsize=(5,4), format='png')


In [59]:
!ls /links/groups/treutlein/DATA/sequencing/20240606_P2808_MARINA_SALL1_microglia_coculture_d15_d30/processed

NGN2iN_TF_batch1_FB  NGN2iN_TF_batch1_GEX  NGN2iN_TF_batch1_TF


In [60]:
!ls /links/groups/treutlein/DATA/sequencing/20240621_P2825_HSIU-CHUAN

raw


In [61]:
!ls /local1/sequencing/DATA/sequencing/20240621_P2825_HSIU-CHUAN/processed

NGN2iN_TF_batch2_FB	 NGN2iN_TF_batch2_GEX  NGN2iN_TF_batch2_TF_new
NGN2iN_TF_batch2_FB_new  NGN2iN_TF_batch2_TF   __NGN2iN_TF_batch2_FB_new.mro


In [8]:
adata1 = sc.read_10x_h5("/local1/sequencing/DATA/sequencing/20240621_P2825_HSIU-CHUAN/processed/NGN2iN_TF_batch2_GEX/outs/raw_feature_bc_matrix.h5")


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [9]:
adata1.var_names_make_unique()

In [10]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

In [11]:
adata1

AnnData object with n_obs × n_vars = 2423522 × 33538
    var: 'gene_ids', 'feature_types', 'genome'

In [13]:
adata1.write_h5ad("adata/NGN2iN_TF_batch1_raw_nonfiltered.h5ad")

In [14]:
def QC_plots(adata,sample,
             x_ngenes_lowerbound = 1500, x_ngenes_upperbound = 2000,
             x_mito_lowerbound = [0.0, 0.07 ],
             x_mito_upperbound = [ 0.10, 0.3 ],      
):
    nCountsPerGene = np.sum(adata.X, axis=0)
    nCellsPerGene = np.sum(adata.X>0, axis=0)

    # Show info
    print("Number of counts (in the dataset units) per gene:", nCountsPerGene.min(), " - " ,nCountsPerGene.max())
    print("Number of cells in which each gene is detected:", nCellsPerGene.min(), " - " ,nCellsPerGene.max())

    nCells=adata.X.shape[0]

    # pySCENIC thresholds
    minCountsPerGene=3*.01*nCells # 3 counts in 1% of cells
    print("minCountsPerGene: ", minCountsPerGene)

    minSamples=.01*nCells # 1% of cells
    print("minSamples: ", minSamples)

    # simply compute the number of genes per cell (computers 'n_genes' column)
    sc.pp.filter_cells(adata, min_genes=0)
    # mito and genes/counts cuts
    mito_genes = adata.var_names.str.startswith('MT-')
    # for each cell compute fraction of counts in mito genes vs. all genes
    adata.obs['percent_mito'] = np.sum(
        adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
    # add the total counts per cell as observations-annotation to adata
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4), dpi=150, sharey=True)

    x = adata.obs['n_genes']
    nbins=100

    sns.distplot(x, ax=ax1, norm_hist=True, bins=nbins)
    sns.distplot(x, ax=ax2, norm_hist=True, bins=nbins)
    sns.distplot(x, ax=ax3, norm_hist=True, bins=nbins)

    ax2.set_xlim(0,x_ngenes_lowerbound)
    ax3.set_xlim(x_ngenes_upperbound, adata.obs['n_genes'].max() )

    for ax in (ax1,ax2,ax3): 
        ax.set_xlabel('')

    ax1.title.set_text('n_genes')
    ax2.title.set_text('n_genes, lower bound')
    ax3.title.set_text('n_genes, upper bound')

    fig.text(-0.01, 0.5, 'Frequency', ha='center', va='center', rotation='vertical', size='x-large')
    fig.text(0.5, 0.0, 'Genes expressed per cell', ha='center', va='center', size='x-large')

    fig.tight_layout()
    fig.savefig(sample+'__filtering_panel_genes.pdf', dpi=600, bbox_inches='tight')


    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4), dpi=150, sharey=True)

    x = adata.obs['percent_mito']
    nbins=100

    sns.distplot(x, ax=ax1, norm_hist=True, bins=nbins)
    sns.distplot(x, ax=ax2, norm_hist=True, bins=int(nbins/(x_mito_lowerbound[1]-x_mito_lowerbound[0])) )
    sns.distplot(x, ax=ax3, norm_hist=True, bins=int(nbins/(x_mito_upperbound[1]-x_mito_upperbound[0])) )

    ax2.set_xlim(x_mito_lowerbound[0], x_mito_lowerbound[1])
    ax3.set_xlim(x_mito_upperbound[0], x_mito_upperbound[1] )
    for ax in (ax1,ax2,ax3): 
        ax.set_xlabel('')

    ax1.title.set_text('percent_mito')
    ax2.title.set_text('percent_mito, lower bound')
    ax3.title.set_text('percent_mito, upper bound')

    fig.text(-0.01, 0.5, 'Frequency', ha='center', va='center', rotation='vertical', size='x-large')
    fig.text(0.5, 0.0, 'Mitochondrial read fraction per cell', ha='center', va='center', size='x-large')

    fig.tight_layout()
    fig.savefig(sample+'__filtering_panel_mitochondria.pdf', dpi=600, bbox_inches='tight')


    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4), dpi=150, sharey=False)

    sns.distplot( adata.obs['n_genes'], ax=ax1, norm_hist=True, bins=100)
    sns.distplot( adata.obs['n_counts'], ax=ax2, norm_hist=True, bins=100)
    sns.distplot( adata.obs['percent_mito'], ax=ax3, norm_hist=True, bins=100)

    ax1.title.set_text('Number of genes expressed per cell')
    ax2.title.set_text('Counts per cell')
    ax3.title.set_text('Mitochondrial read fraction per cell')

    fig.text(-0.01, 0.5, 'Frequency', ha='center', va='center', rotation='vertical', size='x-large')

    fig.tight_layout()

    fig.savefig(sample+'__filtering_panel_prefilter.pdf', dpi=600, bbox_inches='tight')

    sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'],
        jitter=0.4, multi_panel=True,show=False)
    plt.savefig(sample+'__filtering_panel_prefilter_violin.pdf', dpi=600, bbox_inches='tight')

    sc.pl.scatter(adata, x='n_counts', y='n_genes', color='percent_mito',show=False)
    plt.savefig(sample+'__filtering_panel_prefilter_scatter.pdf', dpi=600, bbox_inches='tight')

    return(adata)

In [15]:
def filter_adata(adata,mito_pct=1,n_genes_filter_min=750,n_counts_filter_min=1000):
    from scipy.stats import median_abs_deviation

    def is_outlier(adata, metric: str, nmads: int):
        M = adata.obs[metric]
        outlier = (M < np.median(M) - nmads * median_abs_deviation(M)) | (
            np.median(M) + nmads * median_abs_deviation(M) < M
        )
        return outlier


    adata.obs["outlier"] = (
        is_outlier(adata, "log1p_total_counts", 5)
        | is_outlier(adata, "log1p_n_genes_by_counts", 5)
        | is_outlier(adata, "pct_counts_in_top_20_genes", 5)
    )
    adata.obs.outlier.value_counts()

    adata.obs["mt_outlier"] = is_outlier(adata, "pct_counts_mt", 3) | (
        adata.obs["pct_counts_mt"] > mito_pct
    )
    adata.obs.mt_outlier.value_counts()


    sc.pp.filter_cells(adata, min_genes=n_genes_filter_min)
    sc.pp.filter_cells(adata, min_counts=n_counts_filter_min)
    
    return(adata)


In [16]:
samples = ["NGN2iN_TF_batch1"]
for sample in samples:
    print(sample)
    sample_path = 'adata/'+sample+'_raw_nonfiltered.h5ad'
    sample_path_filtered = 'adata/'+sample+'_filtered.h5ad'
    sample_path_filtered_doublets = 'adata/'+sample+'_filtered_doublets.h5ad'
    sample_path_filtered_final = 'adata/'+sample+'_filtered_final.h5ad'

    adata = sc.read_h5ad(sample_path)
    sc.pp.filter_cells(adata, min_counts=500)

    # mitochondrial genes
    adata.var["mt"] = adata.var_names.str.startswith("MT-")
    # ribosomal genes
    adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))

    sc.pp.calculate_qc_metrics(
        adata, qc_vars=["mt", "ribo"], inplace=True, percent_top=[20], log1p=True
    )

    a = QC_plots(adata,sample,
                 x_ngenes_lowerbound = 1500, x_ngenes_upperbound = 2000,
             x_mito_lowerbound = [0.0, 0.07 ],x_mito_upperbound = [ 0.10, 0.3 ],)

    print(f"Total number of cells: {adata.n_obs}")
    if 'iGlut_post' in sample:
        mito_pct = 5
    if 'iGlut_pre' in sample:
        mito_pct = 6
    if 'iGABA_post' in sample:
        mito_pct = 6
    if 'iGABA_pre' in sample:
        mito_pct = 8
    else:
        mito_pct = 20

    print(mito_pct)
    adata = filter_adata(adata,mito_pct=mito_pct,n_genes_filter_min=0,n_counts_filter_min=500)
    adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()

    print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")

    adata.write_h5ad(sample_path_filtered)
    
    
    sc.external.pp.scrublet(adata) #estimates doublets
    adata.write_h5ad(sample_path_filtered_doublets)

    adata = adata[adata.obs['predicted_doublet'] == False] #do the actual filtering
    adata.write_h5ad(sample_path_filtered_final)

NGN2iN_TF_batch1
Number of counts (in the dataset units) per gene: 0.0  -  3308410.0
Number of cells in which each gene is detected: 0  -  11715
minCountsPerGene:  352.74
minSamples:  117.58



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(x, ax=ax1, norm_hist=True, bins=nbins)

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(x, ax=ax2, norm_hist=True, bins=nbins)

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with

Total number of cells: 11758
20
Number of cells after filtering of low quality cells: 8643


  view_to_actual(adata)


Automatically set threshold at doublet score = 0.61
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 42.9%


In [17]:
adata

View of AnnData object with n_obs × n_vars = 8640 × 33538
    obs: 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'n_genes', 'percent_mito', 'outlier', 'mt_outlier', 'doublet_score', 'predicted_doublet'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'scrublet'

In [None]:
/local1/sequencing/DATA/sequencing/20240621_P2825_HSIU-CHUAN/processed/NGN2iN_TF_batch2_GEX/outs/raw_feature_bc_matrix.h5

In [63]:
adata1_TF = sc.read_10x_h5("/local1/sequencing/DATA/sequencing/20240621_P2825_HSIU-CHUAN/processed/NGN2iN_TF_batch2_TF_new/outs/raw_feature_bc_matrix.h5")

adata1_FB = sc.read_10x_h5("/local1/sequencing/DATA/sequencing/20240621_P2825_HSIU-CHUAN/processed/NGN2iN_TF_batch2_FB_new/outs/raw_feature_bc_matrix.h5")


In [42]:
!ls /local1/sequencing/DATA/sequencing/20240621_P2825_HSIU-CHUAN/processed/

NGN2iN_TF_batch2_FB	 NGN2iN_TF_batch2_GEX  NGN2iN_TF_batch2_TF_new
NGN2iN_TF_batch2_FB_new  NGN2iN_TF_batch2_TF


In [64]:
FB_data_all = pd.DataFrame.sparse.from_spmatrix(adata1_FB.X)
FB_data_all.index = adata1_FB.obs_names
FB_data_all.columns = adata1_FB.var_names


In [65]:
FB_data_all.astype('bool').sum()

  FB_data_all.astype('bool').sum()


NFATC1       350
LEF1         401
LHX4          56
NEUROD1     1212
PHOX2B       764
LHX9        1580
MSX1       10146
ZFHX3         28
TLX2       10975
TCF7L2       580
TCF7L1       329
dtype: int64

In [66]:
barcode_df = pd.read_csv("/links/groups/treutlein/USERS/jjans/software/cellranger/cellranger-7.2.0/lib/python/cellranger/barcodes/translation/3M-february-2018.txt.gz",sep="\t",header=None)

In [67]:
#AAACCCAAGAAACACT
#AAACCCATCAAACACT
#--> CAAG

barcode_df[0] = barcode_df[0]+'-1'
barcode_df[1] = barcode_df[1]+'-1'
barcode_df.columns = ['GEX','FB']

In [68]:
barcode_df.index = barcode_df['FB']

In [69]:
adata1_TF.obs_names = list(barcode_df.loc[adata1_TF.obs_names,'GEX'])

In [70]:
mask_TF = [True if x in adata.obs_names else False for x in adata1_TF.obs_names]
adata1_TF_masked = adata1_TF[mask_TF,:]

In [71]:
TF_data = pd.DataFrame.sparse.from_spmatrix(adata1_TF_masked.X)
TF_data.index = adata1_TF_masked.obs_names
TF_data.columns = adata1_TF_masked.var_names
TF_data.to_csv("adata/TF_data_trans.tsv",sep="\t")

  TF_data.to_csv("adata/TF_data_trans.tsv",sep="\t")


In [72]:
adata1_FB.obs_names = list(barcode_df.loc[adata1_FB.obs_names,'GEX'])

In [73]:
mask_FB = [True if x in adata.obs_names else False for x in adata1_FB.obs_names]
adata1_FB_masked = adata1_FB[mask_FB,:]

In [74]:
adata1_FB_masked

View of AnnData object with n_obs × n_vars = 8427 × 11
    var: 'gene_ids', 'feature_types', 'genome'

In [75]:
FB_data = pd.DataFrame.sparse.from_spmatrix(adata1_FB_masked.X)
FB_data.index = adata1_FB_masked.obs_names
FB_data.columns = adata1_FB_masked.var_names
FB_data.to_csv("adata/FB_data_trans.tsv",sep="\t")

  FB_data.to_csv("adata/FB_data_trans.tsv",sep="\t")


In [76]:
FB_data.astype('bool').sum()

  FB_data.astype('bool').sum()


NFATC1      174
LEF1        150
LHX4         17
NEUROD1     457
PHOX2B      249
LHX9        567
MSX1       2755
ZFHX3        17
TLX2       2887
TCF7L2      241
TCF7L1      193
dtype: int64