In [1]:

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import seaborn as sns
import scanpy as sc
from glob import iglob
import anndata
#import scrublet as scr # requires 'pip install scrublet'
import os
import sklearn
from sklearn.linear_model import LogisticRegression
import matplotlib as mpl
import scipy
import matplotlib.pyplot as plt
import pickle
# Seed NumPy's legacy global RNG so stochastic steps that draw from it repeat across runs.
np.random.seed(0)

# Scanpy session settings: log verbosity, a dump of installed package versions
# (recorded in the cell output for provenance), and default figure parameters.
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=100)  # low dpi (dots per inch) yields small inline figures

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                         9.0.1
asciitree                   NA
asttokens                   NA
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
bottleneck                  1.3.4
cairo                       1.21.0
cffi                        1.15.0
cloudpickle                 2.0.0
colorama                    0.4.4
cycler                      0.10.0
cython_runtime              NA
cytoolz                     0.11.0
dask                        2022.02.1
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
defusedxml                  0.7.1
entrypoints                 0.4
executing                   0.8.3
fasteners                   0.17.3
fsspec                      2022.02.0
google                      NA
h5py                        3.6.0
igraph                      0.9.11
ipykernel                   6.9.1
ipython_genutils            0.2.0
ipywid

In [2]:
# Map each sample directory name to its experimental condition.
# NOTE: the name `id` shadows Python's built-in id(); it is kept unchanged
# because other cells may refer to this mapping by that name.
id = {
    '220615': 'control',
    '220623': 'control',
    '220628': 'control',
}

In [3]:
# Read one 10x Genomics matrix directory, indexing genes by symbol.
# cache=True makes scanpy write an h5ad cache file to speed up later reads
# (see the log line in this cell's output).
data1 = sc.read_10x_mtx(
    './220615/',
    var_names='gene_symbols',
    cache=True,
)

... writing an h5ad cache file to speedup reading next time


In [6]:
# Show the observation (per-barcode) annotation table of the sample loaded into data1.
data1.obs

AAACCCAAGACCGCCT-1
AAACCCAAGCAAGTCG-1
AAACCCAAGGGAGGTG-1
AAACCCAAGGTCACTT-1
AAACCCAAGTCGCCCA-1
...
TTTGTTGGTTTGATCG-1
TTTGTTGTCGCTAGCG-1
TTTGTTGTCTGTCAGA-1
TTTGTTGTCTGTCCGT-1
TTTGTTGTCTTTCTAG-1


In [8]:
%%time
import os
holder = []
path = './00.data/'
files = os.listdir(path)
print(files)
for file in files:
    file_dir = os.path.join(path,file)
    print("开始处理：",file)
    holder.append(sc.read_10x_mtx(file_dir,var_names= 'gene_symbols',cache=True))
    holder[-1].var_names_make_unique()
     # Filer cells with less than 200 genes expressed
    sc.pp.filter_cells(holder[-1], min_genes=200)
    
    # Filer genes expressed in less than 3 cells
    sc.pp.filter_genes(holder[-1], min_cells=3)
     ##add metadata
    holder[-1].obs['ID'] = file
    ##add percent_mito
    mito_genes = [name for name in holder[-1].var_names if name.startswith('mt-')]
    holder[-1].obs['percent_mito'] = np.sum(
        holder[-1][:, mito_genes].X, axis=1).A1 / np.sum(holder[-1].X, axis=1).A1

['220615', '220623', '220628']
开始处理： 220615
... writing an h5ad cache file to speedup reading next time
filtered out 12 cells that have less than 200 genes expressed
filtered out 13102 genes that are detected in less than 3 cells
开始处理： 220623
... writing an h5ad cache file to speedup reading next time
filtered out 23 cells that have less than 200 genes expressed
filtered out 9613 genes that are detected in less than 3 cells
开始处理： 220628
... writing an h5ad cache file to speedup reading next time
filtered out 3025 cells that have less than 200 genes expressed
filtered out 15510 genes that are detected in less than 3 cells
CPU times: total: 1min 38s
Wall time: 1min 38s


In [9]:
%%time
adata = holder[0].concatenate(holder[1:],join='outer',index_unique=None)
adata.X = adata.X.tocsr()

CPU times: total: 703 ms
Wall time: 703 ms


  utils.warn_names_duplicates("obs")
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  utils.warn_names_duplicates("obs")


In [10]:
# Show the per-cell annotation table of the merged object.
adata.obs

Unnamed: 0,n_genes,ID,percent_mito,batch
AAACCCAAGACCGCCT-1,615,220615,0.020630,0
AAACCCAAGCAAGTCG-1,1237,220615,0.014971,0
AAACCCAAGGGAGGTG-1,668,220615,0.008014,0
AAACCCAAGGTCACTT-1,1296,220615,0.017918,0
AAACCCAAGTCGCCCA-1,844,220615,0.012138,0
...,...,...,...,...
TTTGTTGGTGCCTGAC-1,265,220628,0.026866,2
TTTGTTGGTTCAAACC-1,342,220628,0.016432,2
TTTGTTGGTTTCGCTC-1,362,220628,0.021956,2
TTTGTTGTCGCTAGCG-1,287,220628,0.017241,2


In [11]:
# Recompute per-cell QC totals on the merged matrix.
# Summing a sparse matrix along axis 1 returns an (n_cells, 1) np.matrix;
# flatten it to 1-D explicitly instead of relying on pandas to coerce a
# 2-D object into a column.
adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()
# Natural log of the total counts per cell.
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
# Number of genes with a non-zero count in each cell (overwrites the n_genes
# column written earlier by filter_cells).
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(axis=1)).ravel()

In [12]:
# Show obs again to check the n_counts, log_counts and n_genes columns.
adata.obs

Unnamed: 0,n_genes,ID,percent_mito,batch,n_counts,log_counts
AAACCCAAGACCGCCT-1,614,220615,0.020630,0,921.0,6.825460
AAACCCAAGCAAGTCG-1,1237,220615,0.014971,0,2271.0,7.727975
AAACCCAAGGGAGGTG-1,668,220615,0.008014,0,1123.0,7.023759
AAACCCAAGGTCACTT-1,1295,220615,0.017918,0,2623.0,7.872074
AAACCCAAGTCGCCCA-1,842,220615,0.012138,0,1483.0,7.301822
...,...,...,...,...,...,...
TTTGTTGGTGCCTGAC-1,264,220628,0.026866,2,335.0,5.814131
TTTGTTGGTTCAAACC-1,342,220628,0.016432,2,426.0,6.054440
TTTGTTGGTTTCGCTC-1,362,220628,0.021956,2,501.0,6.216606
TTTGTTGTCGCTAGCG-1,287,220628,0.017241,2,406.0,6.006353


In [13]:
# Persist the merged object as h5ad. Create the output directory first:
# nothing earlier in the notebook creates ./cache, so the write could fail
# on a fresh checkout.
os.makedirs('./cache', exist_ok=True)
adata.write('./cache/scRNA_merge.h5ad')