In [1]:
import scanpy as sc
import anndata
from anndata import AnnData
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mode
from collections import Counter
import seaborn as sns
sc.settings.verbosity = 0 
sc.settings.set_figure_params(dpi=150, color_map='viridis')  
sc.logging.print_versions()

## Starting from 10X Dataset

In [2]:
raw_adata1 = sc.read_10x_mtx('oskar_sample1',var_names='gene_symbols',cache=True)

In [3]:
raw_adata2 = sc.read_10x_mtx('oskar_sample2',var_names='gene_symbols',cache=True)

In [4]:
raw_adata3 = sc.read_10x_mtx('salem_sample1',var_names='gene_symbols',cache=True)

In [5]:
raw_adata4 = sc.read_10x_mtx('salem_sample2',var_names='gene_symbols',cache=True)

In [6]:
raw_adata5 = sc.read_10x_mtx('memphis',var_names='gene_symbols',cache=True)

In [7]:
raw_adata6 = sc.read_10x_mtx('london',var_names='gene_symbols',cache=True)

In [8]:
raw_adata7 = sc.read_10x_mtx('Nairobi_1SN',var_names='gene_symbols',cache=True)

In [9]:
raw_adata8 = sc.read_10x_mtx('Nairobi_2SN',var_names='gene_symbols',cache=True)

In [10]:
#join data together
raw_adata_ori = raw_adata1.concatenate(raw_adata2, raw_adata3,raw_adata4,raw_adata5,raw_adata6,raw_adata7,raw_adata8)
raw_adata_ori

AnnData object with n_obs × n_vars = 118240 × 54894
    obs: 'batch'
    var: 'gene_ids'

In [11]:
#give cells (number ids)
raw_adata_ori.obs['cell_id'] = range(raw_adata_ori.obs.shape[0])
raw_adata_ori

AnnData object with n_obs × n_vars = 118240 × 54894
    obs: 'batch', 'cell_id'
    var: 'gene_ids'

In [12]:
#give another colum 
sample_ids = list(map(int,raw_adata_ori.obs['batch']))
sample_names = ['1','2','3','4','5','6','7','8']

sample_names = pd.Series([sample_names[sample_id] for sample_id in sample_ids])
raw_adata_ori.obs['sample_names'] = list(sample_names)
raw_adata_ori

AnnData object with n_obs × n_vars = 118240 × 54894
    obs: 'batch', 'cell_id', 'sample_names'
    var: 'gene_ids'

In [13]:
#give another colum 
sample_ids = list(map(int,raw_adata_ori.obs['batch']))
region_names = ['1','1','2','2','3','4','5','5']
region_names = pd.Series([region_names[sample_id] for sample_id in sample_ids])
raw_adata_ori.obs['monkey'] = list(region_names)
raw_adata_ori

AnnData object with n_obs × n_vars = 118240 × 54894
    obs: 'batch', 'cell_id', 'sample_names', 'monkey'
    var: 'gene_ids'

In [14]:
AnnData.write(raw_adata_ori, filename='./processed_data/raw_adata_BeforeDB.h5ad', compression=None, compression_opts=None, force_dense=None)

... storing 'sample_names' as categorical
... storing 'monkey' as categorical


In [15]:
raw_adata_ori = anndata.read_h5ad('./processed_data/raw_adata_BeforeDB.h5ad')

Doublet removal by doubledetection

In [16]:
import doubletdetection
clf = doubletdetection.BoostClassifier()
doublets = clf.fit(raw_adata_ori.X).predict()
raw_adata_ori.obs['doublet'] = pd.Categorical(doublets.astype(bool))
raw_adata_ori = raw_adata_ori[np.logical_not(list(raw_adata_ori.obs['doublet']))]
del raw_adata_ori.obs['doublet']

  0%|          | 0/25 [00:00<?, ?it/s]

In [17]:
raw_adata = raw_adata_ori
raw_adata

View of AnnData object with n_obs × n_vars = 110633 × 54894
    obs: 'batch', 'cell_id', 'sample_names', 'monkey'
    var: 'gene_ids'

In [18]:
AnnData.write(raw_adata, filename='./processed_data/raw_adata_afterDB.h5ad', compression=None, compression_opts=None, force_dense=None)

In [19]:
sc.pp.filter_cells(raw_adata, min_genes=200)
sc.pp.filter_genes(raw_adata, min_cells=5)
sc.pp.filter_genes(raw_adata, min_counts=15)
mito_genes = raw_adata.var_names.str.startswith('MT-')
# for each cell compute fraction of counts in mito genes vs. all genes
raw_adata.obs['percent.mt'] = np.sum(raw_adata[:, mito_genes].X, axis=1).A1 / np.sum(raw_adata.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
raw_adata.obs['nCount_RNA'] = raw_adata.X.sum(axis=1).A1
raw_adata

Trying to set attribute `.obs` of view, copying.


AnnData object with n_obs × n_vars = 110633 × 31121
    obs: 'batch', 'cell_id', 'sample_names', 'monkey', 'n_genes', 'percent.mt', 'nCount_RNA'
    var: 'gene_ids', 'n_cells', 'n_counts'

In [20]:
raw_adata.var['n_counts']

CICP27        1.800000e+01
AP006222.1    5.963097e+01
MTND1P23      1.448023e+03
MTND2P28      1.334119e+06
MTCO1P12      7.218649e+05
                  ...     
RPL18AP2      4.820137e+01
DIP2A         6.911676e+04
S100B         1.176395e+04
PRMT2         3.914368e+04
DSTNP1        1.942477e+01
Name: n_counts, Length: 31121, dtype: float32

In [21]:
raw_adata.var['n_cells']

CICP27           18
AP006222.1       77
MTND1P23       2127
MTND2P28      37093
MTCO1P12      34631
              ...  
RPL18AP2         48
DIP2A         45321
S100B         13485
PRMT2         36884
DSTNP1           20
Name: n_cells, Length: 31121, dtype: int64

In [22]:
raw_adata.obs['nCount_RNA']

AAACCCAAGCAAATGT-1-0    11324.132812
AAACCCAAGCCTCAGC-1-0    11302.038086
AAACCCACATGGATCT-1-0     5087.105957
AAACCCAGTAGCGCCT-1-0     3620.108398
AAACCCAGTCGTCATA-1-0     3198.436523
                            ...     
TTTGTGTTCTACCTAT-1-7     2039.548096
TTTGTTGGTCCTAACT-1-7     2358.319092
TTTGTTGGTCTCACTG-1-7     1258.595215
TTTGTTGGTTGCAATG-1-7     2958.033203
TTTGTTGGTTGTGATG-1-7     4242.266113
Name: nCount_RNA, Length: 110633, dtype: float32

In [23]:
ribo_genes = raw_adata.var_names.str.startswith(("RPS","RPL"))
print(sum(ribo_genes))
raw_adata.obs['percent_ribo'] = np.sum(raw_adata[:, ribo_genes].X, axis=1).A1 / np.sum(raw_adata.X, axis=1).A1
raw_adata = raw_adata[:,~ribo_genes]
raw_adata

362


View of AnnData object with n_obs × n_vars = 110633 × 30759
    obs: 'batch', 'cell_id', 'sample_names', 'monkey', 'n_genes', 'percent.mt', 'nCount_RNA', 'percent_ribo'
    var: 'gene_ids', 'n_cells', 'n_counts'

In [24]:
AnnData.write(raw_adata,filename="./processed_data/raw_adata_afterDB_colomns_added.h5ad", compression=None, compression_opts=None, force_dense=None)