In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import seaborn as sns
import scanpy as sc
from glob import iglob
import anndata
#import scrublet as scr # requires 'pip install scrublet'
import os
import sklearn
from sklearn.linear_model import LogisticRegression
import matplotlib as mpl
import scipy
import matplotlib.pyplot as plt
import pickle

In [2]:
np.random.seed(0)

In [3]:
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                         9.0.1
asttokens                   NA
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
bottleneck                  1.3.4
cffi                        1.15.0
cloudpickle                 2.0.0
colorama                    0.4.4
cycler                      0.10.0
cython_runtime              NA
cytoolz                     0.11.0
dask                        2022.02.1
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
defusedxml                  0.7.1
executing                   0.8.3
fsspec                      2022.02.0
google                      NA
h5py                        3.6.0
ipykernel                   6.9.1
ipython_genutils            0.2.0
ipywidgets                  7.6.5
jedi                        0.18.1
jinja2                      2.11.3
joblib                      1.1.0
jupyter_server              1.13.5
k

# Load in the data

In [4]:
Tissue ={}
Tissue['GSM5004180_PT1'] = 'Gastric'
Tissue['GSM5004181_PT2'] = 'Gastric'
Tissue['GSM5004182_PT3'] = 'Gastric'

Tissue['GSM5004183_NT1'] = 'Normal'
Tissue['GSM5004184_LN1'] = 'Lymph node'
Tissue['GSM5004185_LN2'] = 'Lymph node'
Tissue['GSM5004187_P1'] = 'Peritoneum'
Tissue['GSM5004188_Li1'] = 'Liver'
Tissue['GSM5004189_Li2'] = 'Liver'

metastasis_status = {}
metastasis_status['GSM5004180_PT1'] = 'Primary'
metastasis_status['GSM5004181_PT2'] = 'Primary'
metastasis_status['GSM5004182_PT3'] = 'Primary'

metastasis_status['GSM5004183_NT1'] = 'Normal'
metastasis_status['GSM5004184_LN1'] = 'Metastasis'
metastasis_status['GSM5004185_LN2'] = 'Metastasis'
metastasis_status['GSM5004187_P1'] = 'Metastasis'
metastasis_status['GSM5004188_Li1'] = 'Metastasis'
metastasis_status['GSM5004189_Li2'] = 'Metastasis'


In [5]:
%%time
import os
holder = []
path = './00.data/'
files = os.listdir(path)
for file in files:
    file_dir = os.path.join(path,file)
    print("开始处理：",file)
    holder.append(sc.read_10x_mtx(file_dir,var_names= 'gene_symbols',cache=True))
    holder[-1].var_names_make_unique()
     # Filer cells with less than 200 genes expressed
    sc.pp.filter_cells(holder[-1], min_genes=200)
    
    # Filer genes expressed in less than 3 cells
    sc.pp.filter_genes(holder[-1], min_cells=3)
    ##add metadata
    holder[-1].obs['ID'] = file.split('_')[0]
    holder[-1].obs['Sample'] = file.split('_')[1]
    info = Tissue[file]
    holder[-1].obs['Tissue'] = info
    stau = metastasis_status[file]
    holder[-1].obs['metastasis_status'] = stau
    ##add percent_mito
    mito_genes = [name for name in holder[-1].var_names if name.startswith('MT-')]
    holder[-1].obs['percent_mito'] = np.sum(
        holder[-1][:, mito_genes].X, axis=1).A1 / np.sum(holder[-1].X, axis=1).A1


开始处理： GSM5004180_PT1
... reading from cache file cache\00.data-GSM5004180_PT1-matrix.h5ad
filtered out 26 cells that have less than 200 genes expressed
filtered out 12306 genes that are detected in less than 3 cells
开始处理： GSM5004181_PT2
... reading from cache file cache\00.data-GSM5004181_PT2-matrix.h5ad
filtered out 54 cells that have less than 200 genes expressed
filtered out 11264 genes that are detected in less than 3 cells
开始处理： GSM5004182_PT3
... reading from cache file cache\00.data-GSM5004182_PT3-matrix.h5ad
filtered out 208 cells that have less than 200 genes expressed
filtered out 13050 genes that are detected in less than 3 cells
开始处理： GSM5004183_NT1
... reading from cache file cache\00.data-GSM5004183_NT1-matrix.h5ad
filtered out 145 cells that have less than 200 genes expressed
filtered out 14364 genes that are detected in less than 3 cells
开始处理： GSM5004184_LN1
... reading from cache file cache\00.data-GSM5004184_LN1-matrix.h5ad
filtered out 37 cells that have less than 20

In [6]:
%%time
adata = holder[0].concatenate(holder[1:],join='outer',index_unique=None)
adata.X = adata.X.tocsr()

  utils.warn_names_duplicates("obs")
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  utils.warn_names_duplicates("obs")


CPU times: total: 1.14 s
Wall time: 1.17 s


In [7]:
adata

AnnData object with n_obs × n_vars = 48020 × 25444
    obs: 'n_genes', 'ID', 'Sample', 'Tissue', 'metastasis_status', 'percent_mito', 'batch'
    var: 'gene_ids-0', 'feature_types-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'n_cells-7', 'gene_ids-8', 'feature_types-8', 'n_cells-8'

In [8]:
adata.obs['n_counts'] = adata.X.sum(1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1)

In [9]:
adata.obs

Unnamed: 0,n_genes,ID,Sample,Tissue,metastasis_status,percent_mito,batch,n_counts,log_counts
AAACCCACAACAACAA-1,352,GSM5004180,PT1,Gastric,Primary,0.008205,0,975.0,6.882438
AAACCCAGTCAAAGTA-1,4415,GSM5004180,PT1,Gastric,Primary,0.076083,0,16022.0,9.681718
AAACCCAGTGGATACG-1,492,GSM5004180,PT1,Gastric,Primary,0.009602,0,1458.0,7.284821
AAACCCATCGTTCCCA-1,323,GSM5004180,PT1,Gastric,Primary,0.009579,0,522.0,6.257668
AAACGAAAGCTGTTCA-1,771,GSM5004180,PT1,Gastric,Primary,0.005300,0,2264.0,7.724888
...,...,...,...,...,...,...,...,...,...
TTTGTTGTCGCGCTGA-1,1068,GSM5004189,Li2,Liver,Metastasis,0.086989,8,2667.0,7.888710
TTTGTTGTCGGACTGC-1,316,GSM5004189,Li2,Liver,Metastasis,0.310109,8,732.0,6.595780
TTTGTTGTCGTAGGAG-1,1034,GSM5004189,Li2,Liver,Metastasis,0.043174,8,2432.0,7.796469
TTTGTTGTCTAGAACC-1,1186,GSM5004189,Li2,Liver,Metastasis,0.054685,8,2615.0,7.869020


In [10]:
adata.var

Unnamed: 0,gene_ids-0,feature_types-0,n_cells-0,gene_ids-1,feature_types-1,n_cells-1,gene_ids-2,feature_types-2,n_cells-2,gene_ids-3,...,n_cells-5,gene_ids-6,feature_types-6,n_cells-6,gene_ids-7,feature_types-7,n_cells-7,gene_ids-8,feature_types-8,n_cells-8
A1BG,ENSG00000121410,Gene Expression,224.0,ENSG00000121410,Gene Expression,384.0,ENSG00000121410,Gene Expression,379.0,ENSG00000121410,...,1572.0,ENSG00000121410,Gene Expression,826.0,ENSG00000121410,Gene Expression,120.0,ENSG00000121410,Gene Expression,1147.0
A1BG-AS1,ENSG00000268895,Gene Expression,34.0,ENSG00000268895,Gene Expression,70.0,ENSG00000268895,Gene Expression,41.0,ENSG00000268895,...,263.0,ENSG00000268895,Gene Expression,127.0,ENSG00000268895,Gene Expression,36.0,ENSG00000268895,Gene Expression,168.0
A1CF,ENSG00000148584,Gene Expression,56.0,ENSG00000148584,Gene Expression,11.0,ENSG00000148584,Gene Expression,24.0,ENSG00000148584,...,,ENSG00000148584,Gene Expression,61.0,ENSG00000148584,Gene Expression,15.0,ENSG00000148584,Gene Expression,9.0
A2M,ENSG00000175899,Gene Expression,91.0,ENSG00000175899,Gene Expression,1059.0,ENSG00000175899,Gene Expression,90.0,ENSG00000175899,...,69.0,ENSG00000175899,Gene Expression,590.0,ENSG00000175899,Gene Expression,233.0,ENSG00000175899,Gene Expression,829.0
A2M-AS1,ENSG00000245105,Gene Expression,33.0,ENSG00000245105,Gene Expression,103.0,ENSG00000245105,Gene Expression,46.0,ENSG00000245105,...,67.0,ENSG00000245105,Gene Expression,85.0,ENSG00000245105,Gene Expression,214.0,ENSG00000245105,Gene Expression,1707.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,ENSG00000070476,Gene Expression,205.0,ENSG00000070476,Gene Expression,779.0,ENSG00000070476,Gene Expression,215.0,ENSG00000070476,...,463.0,ENSG00000070476,Gene Expression,508.0,ENSG00000070476,Gene Expression,125.0,ENSG00000070476,Gene Expression,614.0
ZYG11A,ENSG00000203995,Gene Expression,43.0,ENSG00000203995,Gene Expression,9.0,ENSG00000203995,Gene Expression,12.0,ENSG00000203995,...,17.0,ENSG00000203995,Gene Expression,27.0,,,,ENSG00000203995,Gene Expression,11.0
ZYG11B,ENSG00000162378,Gene Expression,408.0,ENSG00000162378,Gene Expression,1766.0,ENSG00000162378,Gene Expression,378.0,ENSG00000162378,...,492.0,ENSG00000162378,Gene Expression,786.0,ENSG00000162378,Gene Expression,105.0,ENSG00000162378,Gene Expression,697.0
ZYX,ENSG00000159840,Gene Expression,1230.0,ENSG00000159840,Gene Expression,3189.0,ENSG00000159840,Gene Expression,596.0,ENSG00000159840,...,1075.0,ENSG00000159840,Gene Expression,1303.0,ENSG00000159840,Gene Expression,254.0,ENSG00000159840,Gene Expression,1936.0


# 保存scanpy.h5ad用于后续使用

In [11]:
adata.write('./cache/scRNA_merge.h5ad')