In [1]:
import scanpy as sc
import anndata
from anndata import AnnData
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mode
from collections import Counter
import seaborn as sns
sc.settings.verbosity = 0 
sc.settings.set_figure_params(dpi=150, color_map='viridis')  
sc.logging.print_versions()

scanpy==1.4.3 anndata==0.6.22.post1 umap==0.3.9 numpy==1.16.2 scipy==1.2.1 pandas==0.23.4 scikit-learn==0.20.3 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [2]:
pwd

'/pylon5/mc5plqp/jih8/Peanut_b2/Peanut_Fiona'

## Starting from 10X Dataset

In [3]:
raw_adata1 = sc.read_10x_mtx('/pylon5/mc5plqp/jih8/Peanut_b2/1_soupx/strainedCounts_caudate_01',
                       var_names='gene_symbols',                  
                        cache=True)

In [4]:
raw_adata2 = sc.read_10x_mtx('/pylon5/mc5plqp/jih8/Peanut_b2/1_soupx/strainedCounts_nacc_02',
                       var_names='gene_symbols',                  
                        cache=True)

In [5]:
raw_adata3 = sc.read_10x_mtx('/pylon5/mc5plqp/jih8/Peanut_b2/1_soupx/strainedCounts_putamen_3',
                       var_names='gene_symbols',                 
                        cache=True)

In [6]:
raw_adata4 = sc.read_10x_mtx('/pylon5/mc5plqp/jih8/Peanut_b2/1_soupx/strainedCounts_caudate_04',
                       var_names='gene_symbols',                  
                        cache=True)

In [7]:
raw_adata5 = sc.read_10x_mtx('/pylon5/mc5plqp/jih8/Peanut_b2/1_soupx/strainedCounts_nacc_5',
                       var_names='gene_symbols',                  
                        cache=True)

In [8]:
raw_adata6 = sc.read_10x_mtx('/pylon5/mc5plqp/jih8/Peanut_b2/1_soupx/strainedCounts_putamen_6',
                       var_names='gene_symbols',                  
                        cache=True)

In [9]:
raw_adata7 = sc.read_10x_mtx('/pylon5/mc5plqp/jih8/tom_fiona/new_ref/cellrangercountedfiles/nacc/filtered_feature_bc_matrix',
                       var_names='gene_symbols',                  
                        cache=True)

In [10]:
raw_adata8 = sc.read_10x_mtx('/pylon5/mc5plqp/jih8/tom_fiona/new_ref/cellrangercountedfiles/caudate/filtered_feature_bc_matrix',
                       var_names='gene_symbols',                  
                        cache=True)

In [11]:
raw_adata9 = sc.read_10x_mtx('/pylon5/mc5plqp/jih8/tom_fiona/new_ref/cellrangercountedfiles/putamen/filtered_feature_bc_matrix',
                       var_names='gene_symbols',                  
                        cache=True)

In [12]:
#join data together
raw_adata_ori = raw_adata1.concatenate(raw_adata2, raw_adata3,raw_adata4, raw_adata5,raw_adata6,raw_adata7,raw_adata8,raw_adata9)
raw_adata_ori

AnnData object with n_obs × n_vars = 85299 × 53573 
    obs: 'batch'
    var: 'gene_ids-0', 'gene_ids-1', 'gene_ids-2', 'gene_ids-3', 'gene_ids-4', 'gene_ids-5', 'gene_ids-6', 'feature_types-6', 'gene_ids-7', 'feature_types-7', 'gene_ids-8', 'feature_types-8'

In [13]:
#give cells (number ids)
raw_adata_ori.obs['cell_id'] = range(raw_adata_ori.obs.shape[0])
raw_adata_ori

AnnData object with n_obs × n_vars = 85299 × 53573 
    obs: 'batch', 'cell_id'
    var: 'gene_ids-0', 'gene_ids-1', 'gene_ids-2', 'gene_ids-3', 'gene_ids-4', 'gene_ids-5', 'gene_ids-6', 'feature_types-6', 'gene_ids-7', 'feature_types-7', 'gene_ids-8', 'feature_types-8'

In [14]:
#give another colum 
sample_ids = list(map(int,raw_adata_ori.obs['batch']))
sample_names = ['caudate_01','nacc_01','putamen_01','caudate_02','nacc_02','putamen_02','Nacc_Fiona','Caudate_Fiona','Putamen_Fiona']
sample_names = pd.Series([sample_names[sample_id] for sample_id in sample_ids])
raw_adata_ori.obs['sample_names'] = list(sample_names)
raw_adata_ori

AnnData object with n_obs × n_vars = 85299 × 53573 
    obs: 'batch', 'cell_id', 'sample_names'
    var: 'gene_ids-0', 'gene_ids-1', 'gene_ids-2', 'gene_ids-3', 'gene_ids-4', 'gene_ids-5', 'gene_ids-6', 'feature_types-6', 'gene_ids-7', 'feature_types-7', 'gene_ids-8', 'feature_types-8'

In [15]:
#give another colum 
sample_ids = list(map(int,raw_adata_ori.obs['batch']))
region_names = ['caudate','nacc','putamen','caudate','nacc','putamen','nacc_Fiona','caudate_Fiona','putamen_Fiona']
region_names = pd.Series([region_names[sample_id] for sample_id in sample_ids])
raw_adata_ori.obs['regions'] = list(region_names)
raw_adata_ori

AnnData object with n_obs × n_vars = 85299 × 53573 
    obs: 'batch', 'cell_id', 'sample_names', 'regions'
    var: 'gene_ids-0', 'gene_ids-1', 'gene_ids-2', 'gene_ids-3', 'gene_ids-4', 'gene_ids-5', 'gene_ids-6', 'feature_types-6', 'gene_ids-7', 'feature_types-7', 'gene_ids-8', 'feature_types-8'

Doublet removal by doubledetection

In [16]:
import doubletdetection
clf = doubletdetection.BoostClassifier()
doublets = clf.fit(raw_adata_ori.X).predict()
raw_adata_ori.obs['doublet'] = pd.Categorical(doublets.astype(bool))
raw_adata_ori = raw_adata_ori[np.logical_not(list(raw_adata_ori.obs['doublet']))]

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




In [17]:
raw_adata = raw_adata_ori
raw_adata

View of AnnData object with n_obs × n_vars = 80902 × 53573 
    obs: 'batch', 'cell_id', 'sample_names', 'regions', 'doublet'
    var: 'gene_ids-0', 'gene_ids-1', 'gene_ids-2', 'gene_ids-3', 'gene_ids-4', 'gene_ids-5', 'gene_ids-6', 'feature_types-6', 'gene_ids-7', 'feature_types-7', 'gene_ids-8', 'feature_types-8'

In [18]:
pwd

'/pylon5/mc5plqp/jih8/Peanut_b2/Peanut_Fiona'

In [19]:
AnnData.write(raw_adata, filename='/pylon5/mc5plqp/jih8/Peanut_b2/Peanut_Fiona/raw_adata_afterDB_beforePre_SoupX.h5ad', compression=None, compression_opts=None, force_dense=None)

Trying to set attribute `.obs` of view, making a copy.
... storing 'sample_names' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'regions' as categorical
Trying to set attribute `.var` of view, making a copy.
... storing 'feature_types-6' as categorical
Trying to set attribute `.var` of view, making a copy.
... storing 'feature_types-7' as categorical
Trying to set attribute `.var` of view, making a copy.
... storing 'feature_types-8' as categorical
