In [1]:
import glob
import pandas as pd
import scanpy as sc
import scanpy.external as sce

In [2]:
# Input: directory holding the filtered per-library scanpy objects (with metadata)
scanpy_dir = "/directflow/SCCGGroupShare/projects/blabow/tenk10k_phase1/data_processing/scanpy/output/scanpy_objects_w_metadata/"

# Output: directory where the integrated (concatenated) object is written
out_dir = "/directflow/SCCGGroupShare/projects/blabow/tenk10k_phase1/data_processing/scanpy/output/integrated_objects/"

In [3]:
# Collect every per-library file in scanpy_dir whose name starts with "S".
# glob returns paths in arbitrary (filesystem-dependent) order; concatenate()
# numbers batches by position, so sort the paths to keep the batch labels and
# the obs-name suffixes reproducible from one run to the next.
scanpy_files = sorted(glob.glob(scanpy_dir+"S*"))
if not scanpy_files:
    # Fail early with a clear message instead of an IndexError on datasets[0]
    raise FileNotFoundError(f"No files matching 'S*' found in {scanpy_dir}")

# Read each library into memory, then combine all of them into one AnnData
# object (one batch per input file, in the sorted order above).
datasets = [sc.read(file) for file in scanpy_files]
adata = datasets[0].concatenate(*datasets[1:])
# Original author's note: passing the list itself also works:
# adata = datasets[0].concatenate(datasets[1:])

In [4]:
# Notes: making the concatenation work with a higher number of samples.
# A more memory-efficient approach (avoids keeping 2 copies of each AnnData object)??
# adata = sc.read(scanpy_files[0])
# for file in scanpy_files[1:]:
#     adata = adata.concatenate(sc.read(file))
# Alternatively, maybe I can use AnnCollection rather than concatenating all the datasets?
# This would create a concatenated view of the data without copying anything and should be more memory efficient.
# from anndata.experimental.multi_files import AnnCollection
# AnnCollection(datasets)

In [5]:
# Load gencode (v44) gene info file
gencode_text_file = '/share/ScratchGeneral/anncuo/reference_data/gencode.v44.basic.annotation_df.txt'
gene_info = pd.read_csv(gencode_text_file)

# Add gene info to AnnData
# Use the gene ids as the var index (plain list, so the index stays unnamed)
adata.var.index = adata.var['gene_ids'].tolist()
genes_cellranger = adata.var.index
# Index the annotation by gene id with the version suffix (".N") removed
gene_info.index = [gene.split(".")[0] for gene in gene_info['gene_id']]
# Only consider genes in AnnData object
gene_info_df = gene_info[gene_info.index.isin(genes_cellranger)]
# Align the annotation explicitly to the AnnData gene order, one row per gene:
# - drop repeated unversioned ids (keeping the first), which would otherwise
#   make the column-wise concat raise;
# - reindex so genes missing from the annotation get NaN rows and the row
#   order matches adata.var exactly instead of relying on concat's index union.
gene_info_aligned = (
    gene_info_df[~gene_info_df.index.duplicated(keep="first")]
    .reindex(genes_cellranger)
)
# A reordered table assigned to adata.var would mislabel genes, so verify the
# alignment before merging.
assert gene_info_aligned.index.equals(adata.var.index), "gene info not aligned to adata.var"
# Add info
adata.var = pd.concat([adata.var, gene_info_aligned], axis=1)

In [7]:
# Save the concatenated, gene-annotated object as an h5ad file in out_dir
out_file = f'{out_dir}/240_libraries_concatenated_gene_info.h5ad'
adata.write_h5ad(out_file)

In [8]:
# Sanity check: count the distinct values in the `batch` column of adata.obs
batch_labels = adata.obs['batch']
batch_labels.nunique()

240

In [10]:
# Show the gene (variable) identifiers, i.e. the index of adata.var
adata.var.index

Index(['ENSG00000290825', 'ENSG00000243485', 'ENSG00000237613',
       'ENSG00000290826', 'ENSG00000186092', 'ENSG00000238009',
       'ENSG00000239945', 'ENSG00000239906', 'ENSG00000241860',
       'ENSG00000241599',
       ...
       'ENSG00000275249', 'ENSG00000274792', 'ENSG00000274175',
       'ENSG00000275869', 'ENSG00000273554', 'ENSG00000277836',
       'ENSG00000278633', 'ENSG00000276017', 'ENSG00000278817',
       'ENSG00000277196'],
      dtype='object', length=38592)

In [11]:
# Show the cell (observation) identifiers, i.e. the index of adata.obs
adata.obs.index

Index(['AAACCCAAGACTACCT_S0056a-0', 'AAACCCAAGAGTCTGG_S0056a-0',
       'AAACCCAAGATCACCT_S0056a-0', 'AAACCCAAGCCGCTTG_S0056a-0',
       'AAACCCAAGCCTCATA_S0056a-0', 'AAACCCAAGTCATAGA_S0056a-0',
       'AAACCCACAAGACCTT_S0056a-0', 'AAACCCACAAGCACCC_S0056a-0',
       'AAACCCACACAGACGA_S0056a-0', 'AAACCCACACTACGGC_S0056a-0',
       ...
       'TTTGTTGGTCTTGAGT_S0087-239', 'TTTGTTGGTGCCTAAT_S0087-239',
       'TTTGTTGGTGGTACAG_S0087-239', 'TTTGTTGGTTGAGTCT_S0087-239',
       'TTTGTTGGTTTCGTAG_S0087-239', 'TTTGTTGTCAAGTCGT_S0087-239',
       'TTTGTTGTCCCTTGTG_S0087-239', 'TTTGTTGTCCGCGAGT_S0087-239',
       'TTTGTTGTCGCTACAA_S0087-239', 'TTTGTTGTCTACGGTA_S0087-239'],
      dtype='object', length=5084027)

In [13]:
# Inspect the per-gene table after the gene-info columns were appended.
# NOTE(review): rows whose annotation columns (type, start, ..., gene_type) are
# empty presumably belong to gene ids absent from the annotation file — confirm.
adata.var

Unnamed: 0,gene_ids,feature_types,genome,mt,n_cells_by_counts-0,mean_counts-0,pct_dropout_by_counts-0,total_counts-0,n_cells_by_counts-1,mean_counts-1,...,type,start,end,score,strand,phase,attributes,gene_id,gene_name,gene_type
ENSG00000290825,ENSG00000290825,Gene Expression,GRCh38,False,1,0.000055,99.994510,1.0,0,0.000000,...,gene,11869.0,14409.0,.,+,.,"gene_id ""ENSG00000290825.1""; gene_type ""lncRNA...",ENSG00000290825.1,DDX11L2,lncRNA
ENSG00000243485,ENSG00000243485,Gene Expression,GRCh38,False,0,0.000000,100.000000,0.0,0,0.000000,...,gene,29554.0,31109.0,.,+,.,"gene_id ""ENSG00000243485.5""; gene_type ""lncRNA...",ENSG00000243485.5,MIR1302-2HG,lncRNA
ENSG00000237613,ENSG00000237613,Gene Expression,GRCh38,False,0,0.000000,100.000000,0.0,0,0.000000,...,gene,34554.0,36081.0,.,-,.,"gene_id ""ENSG00000237613.2""; gene_type ""lncRNA...",ENSG00000237613.2,FAM138A,lncRNA
ENSG00000290826,ENSG00000290826,Gene Expression,GRCh38,False,0,0.000000,100.000000,0.0,0,0.000000,...,gene,57598.0,64116.0,.,+,.,"gene_id ""ENSG00000290826.1""; gene_type ""lncRNA...",ENSG00000290826.1,ENSG00000290826,lncRNA
ENSG00000186092,ENSG00000186092,Gene Expression,GRCh38,False,0,0.000000,100.000000,0.0,0,0.000000,...,gene,65419.0,71585.0,.,+,.,"gene_id ""ENSG00000186092.7""; gene_type ""protei...",ENSG00000186092.7,OR4F5,protein_coding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277836,ENSG00000277836,Gene Expression,GRCh38,False,0,0.000000,100.000000,0.0,0,0.000000,...,,,,,,,,,,
ENSG00000278633,ENSG00000278633,Gene Expression,GRCh38,False,0,0.000000,100.000000,0.0,0,0.000000,...,,,,,,,,,,
ENSG00000276017,ENSG00000276017,Gene Expression,GRCh38,False,1,0.000055,99.994510,1.0,2,0.000093,...,,,,,,,,,,
ENSG00000278817,ENSG00000278817,Gene Expression,GRCh38,False,201,0.011584,98.896453,211.0,198,0.009309,...,,,,,,,,,,
