In [3]:
import glob
import pandas as pd
import scanpy as sc
import scanpy.external as sce

In [4]:
# Input: directory holding the filtered per-library scanpy objects (with metadata)
scanpy_dir = "/directflow/SCCGGroupShare/projects/blabow/tenk10k_phase1/data_processing/scanpy/output/scanpy_objects_w_metadata/"

# Output: directory that receives the integrated object and derived files
out_dir = "/directflow/SCCGGroupShare/projects/blabow/tenk10k_phase1/data_processing/scanpy/output/integrated_objects/"

In [3]:
# Collect every file in scanpy_dir whose name starts with "S".
# glob returns paths in arbitrary, filesystem-dependent order, so sort them:
# the position of each file in this list determines the `batch` value that
# `concatenate` assigns to its cells, and that should be reproducible.
scanpy_files = sorted(glob.glob(scanpy_dir+"S*"))
if not scanpy_files:
    # Fail with a clear message instead of an IndexError on datasets[0]
    raise FileNotFoundError(f"No files matching 'S*' found in {scanpy_dir}")

# Read each file into an AnnData object, then combine them into one object
datasets = [sc.read(file) for file in scanpy_files]
adata = datasets[0].concatenate(*datasets[1:])
# adata = datasets[0].concatenate(datasets[1:]) # This works 

In [4]:
# Notes: making the concatenation work with a higher number of samples.
# Would this be more memory efficient (it doesn't keep 2 copies of each AnnData object)?
# adata = sc.read(scanpy_files[0])
# for file in scanpy_files[1:]:
#     adata = adata.concatenate(sc.read(file))
# Maybe I can use AnnCollection rather than concatenating all the datasets?
# This would create a concatenated view of the data without copying anything and should be more memory efficient.
# from anndata.experimental.multi_files import AnnCollection
# AnnCollection(datasets)

In [5]:
# Load gencode (v44) gene info file
gencode_text_file = '/share/ScratchGeneral/anncuo/reference_data/gencode.v44.basic.annotation_df.txt'
gene_info = pd.read_csv(gencode_text_file)

# Add gene info to AnnData
# Use the values of the `gene_ids` column as the var index
adata.var.index = adata.var['gene_ids'].tolist()
genes_cellranger = adata.var.index
# Remove gene version: keep only the part of `gene_id` before the first "."
gene_info.index = gene_info['gene_id'].str.split(".").str[0].tolist()
# Only consider genes in AnnData object
gene_info_df = gene_info[gene_info.index.isin(genes_cellranger)]
# Stripping the version can leave several annotation rows with the same ID;
# keep the first one so every gene maps to exactly one annotation row.
gene_info_df = gene_info_df[~gene_info_df.index.duplicated(keep='first')]
# Align the annotation to the exact row order of adata.var before joining, so
# the result is guaranteed to have one row per gene in the original order
# (genes without annotation get NaN).
gene_info_df = gene_info_df.reindex(adata.var.index)
# Add info
adata.var = pd.concat([adata.var, gene_info_df], axis=1)

In [7]:
# Write the concatenated AnnData object (with gene info added) to an .h5ad file.
# NOTE: out_dir already ends with "/", so the resulting path contains "//".
out_file = f'{out_dir}/240_libraries_concatenated_gene_info.h5ad'
adata.write(out_file)

# Save list of all the cell types in the object, used to split the object later in the pipeline

In [5]:
# Reload the concatenated object from the .h5ad file in out_dir
adata = sc.read(f'{out_dir}/240_libraries_concatenated_gene_info.h5ad')

In [15]:
# Gather the distinct labels found in the `wg2_scpred_prediction` obs column
unique_cell_types = adata.obs['wg2_scpred_prediction'].unique()

# Store them in a plain-text file inside out_dir, one label per line
with open(f'{out_dir}unique_cell_types_wg2_scpred.txt', 'w') as handle:
    handle.writelines(str(label) + '\n' for label in unique_cell_types)

# Checking that the new object has all data compared to the previous run

In [5]:
# Load the newly written 240-library object for comparison with the previous run
adata_240 = sc.read(f'{out_dir}/240_libraries_concatenated_gene_info.h5ad')

In [6]:
# (number of observations, number of obs metadata columns) in the 240-library object
adata_240.obs.shape

(5084027, 39)

In [7]:
# how many pools? (number of distinct values in the `batch` column)
adata_240.obs['batch'].nunique()

240

In [8]:
# how many sequencing libraries? (individuals are counted via the `individual` column)
adata_240.obs['sequencing_library'].nunique()

240

In [9]:
# how many individuals? (number of distinct values in the `individual` column)
adata_240.obs['individual'].nunique()

1812

In [10]:
# List the obs metadata column names of the 240-library object
adata_240.obs.columns

Index(['cellbender_background_fraction', 'cellbender_cell_probability',
       'cellbender_cell_size', 'cellbender_droplet_efficiency',
       'celltypist_predicted_labels', 'celltypist_over_clustering',
       'celltypist_majority_voting', 'celltypist_conf_score', 'wg2_sample',
       'wg2_nCount_RNA', 'wg2_nFeature_RNA', 'wg2_percent_mt',
       'wg2_azimuth_predicted_celltype_l2',
       'wg2_azimuth_predicted_celltype_l2_score', 'wg2_scpred_prediction',
       'Vireo_Individual_Assignment', 'Vireo_DropletType',
       'scDblFinder_DropletType', 'scDblFinder_Score', 'scds_score',
       'scds_DropletType', 'MajoritySinglet_DropletType',
       'MajoritySinglet_Individual_Assignment', 'n_genes', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'original_barcode',
       'new_cell_name', 'sequencing_library', 'individual', 'cohort', 'ct_id',
       'cpg_id', 'onek1k_id', 'tob_id', 'onek1k_donor', 'batch'],
      dtype='object')

In [4]:
# Load the object from the previous run (224 libraries) to compare against.
# NOTE: reading this file emits a warning that its observation names are not unique.
adata_224 = sc.read('/directflow/SCCGGroupShare/projects/anncuo/TenK10K_pilot/tenk10k/data_processing/integrated_objects/224_libraries/concatenated_gene_info_donor_info.h5ad')

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [5]:
# (number of observations, number of obs metadata columns) in the 224-library object
adata_224.obs.shape

(4494637, 40)

In [6]:
# how many pools? (number of distinct values in the `batch` column)
adata_224.obs['batch'].nunique()

224

In [7]:
# how many sequencing libraries? (individuals are counted via the `individual` column)
adata_224.obs['sequencing_library'].nunique()

224

In [8]:
# how many individuals? (number of distinct values in the `individual` column)
adata_224.obs['individual'].nunique()

1654

In [9]:
# Additional observations in the new object: obs row count of the 240-library
# object (5084027) minus that of the 224-library object (4494637), both taken
# from the `.obs.shape` outputs recorded in this notebook.
5084027 - 4494637

589390

In [10]:
# List the obs metadata column names of the 224-library object
adata_224.obs.columns

Index(['cellbender_background_fraction', 'cellbender_cell_probability',
       'cellbender_cell_size', 'cellbender_droplet_efficiency',
       'celltypist_predicted_labels', 'celltypist_over_clustering',
       'celltypist_majority_voting', 'celltypist_conf_score', 'wg2_sample',
       'wg2_nCount_RNA', 'wg2_nFeature_RNA', 'wg2_percent_mt',
       'wg2_azimuth_predicted_celltype_l2',
       'wg2_azimuth_predicted_celltype_l2_score', 'wg2_scpred_prediction',
       'Vireo_Individual_Assignment', 'Vireo_DropletType',
       'scDblFinder_DropletType', 'scDblFinder_Score', 'scds_score',
       'scds_DropletType', 'MajoritySinglet_DropletType',
       'MajoritySinglet_Individual_Assignment', 'n_genes', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'original_barcode',
       'new_cell_name', 'sequencing_library', 'individual', 'cohort',
       'onek1k_id', 'cpg_id_old', 'tob_id', 'cpg_id', 'onek1k_donor', 'ct_id',
       'batch'],
      dtype='object')

In [12]:
# Number of distinct obs column names; comparing it with the column count from
# `.obs.shape` shows whether any column name is duplicated.
adata_224.obs.columns.nunique()

40

In [12]:
# obs column names copied from the `.obs.columns` outputs of the two objects.
# The first 33 names are the same, in the same order, in both objects, so they
# are written once and reused.
_shared_obs = [
    'cellbender_background_fraction',
    'cellbender_cell_probability',
    'cellbender_cell_size',
    'cellbender_droplet_efficiency',
    'celltypist_predicted_labels',
    'celltypist_over_clustering',
    'celltypist_majority_voting',
    'celltypist_conf_score',
    'wg2_sample',
    'wg2_nCount_RNA',
    'wg2_nFeature_RNA',
    'wg2_percent_mt',
    'wg2_azimuth_predicted_celltype_l2',
    'wg2_azimuth_predicted_celltype_l2_score',
    'wg2_scpred_prediction',
    'Vireo_Individual_Assignment',
    'Vireo_DropletType',
    'scDblFinder_DropletType',
    'scDblFinder_Score',
    'scds_score',
    'scds_DropletType',
    'MajoritySinglet_DropletType',
    'MajoritySinglet_Individual_Assignment',
    'n_genes',
    'n_genes_by_counts',
    'total_counts',
    'total_counts_mt',
    'pct_counts_mt',
    'original_barcode',
    'new_cell_name',
    'sequencing_library',
    'individual',
    'cohort',
]

# Columns of the 240-library object: shared names plus its remaining six
obs_240 = _shared_obs + [
    'ct_id', 'cpg_id', 'onek1k_id', 'tob_id', 'onek1k_donor', 'batch',
]

# Columns of the 224-library object: shared names plus its remaining seven
obs_224 = _shared_obs + [
    'onek1k_id', 'cpg_id_old', 'tob_id', 'cpg_id', 'onek1k_donor', 'ct_id',
    'batch',
]

In [18]:
# Print every obs column name that is in the 224-library list but not in the
# 240-library list. A plain loop is used instead of a list comprehension so the
# cell does not build and display a throwaway list of `None` values.
for col in obs_224:
    if col not in obs_240:
        print(col)

cpg_id_old


[None]