In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

# Path to a JSON file under google_sheets_api/ — presumably the credentials consumed by
# get_gspread_df (TODO confirm); it is not referenced by any cell visible in this notebook.
GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

package ‘methods’ was built under R version 4.3.3 
1: package ‘datasets’ was built under R version 4.3.3 
2: package ‘utils’ was built under R version 4.3.3 
3: package ‘grDevices’ was built under R version 4.3.3 
4: package ‘graphics’ was built under R version 4.3.3 
5: package ‘stats’ was built under R version 4.3.3 


In [2]:
# Identifier of the dataset processed in this notebook; also used to build file names.
DATASET_ID = "HLCA_v1_core"
# Input AnnData file for this dataset.
H5AD_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.h5ad"
# Output directories; the processed AnnData object is written to both of them.
OUTPUT_PATH_PREREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision'
OUTPUT_PATH_POSTREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision'

In [3]:
# Names of the obs columns used below for the author cell type label
# and for the Cell Ontology term ID.
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'

# Load data

In [4]:
# Load the HLCA v1 core AnnData object from H5AD_PATH.
adata = sc.read_h5ad(H5AD_PATH)

In [6]:
# Number of cells per source dataset (missing values are counted as well).
adata.obs['dataset'].value_counts(dropna=False)

Banovich_Kropski_2020        121894
Barbry_Leroy_2020             74487
Nawijn_2021                   70402
Misharin_2021                 64843
Krasnow_2020                  60982
Misharin_Budinger_2018        41220
Meyer_2019                    35554
Jain_Misharin_2021_10Xv2      33135
Seibold_2020_10Xv3            21466
Lafyatis_Rojas_2019_10Xv2     21260
Jain_Misharin_2021_10Xv1      12422
Teichmann_Meyer_2019          12231
Seibold_2020_10Xv2            12127
Lafyatis_Rojas_2019_10Xv1      2921
Name: dataset, dtype: int64

In [8]:
# Export a barcode -> source-dataset lookup table for all cells in `adata`.
# The barcode suffix and the CSV file name are both derived from DATASET_ID, so they
# stay consistent with the rest of the notebook instead of repeating the literal ID.
barcodes = [f"{barcode}-{DATASET_ID}" for barcode in adata.obs.index]
dataset_ids = adata.obs['dataset']

# `dataset_ids` is a Series, so the frame inherits the obs index; it is dropped on write.
df = pd.DataFrame({'barcode': barcodes, 'dataset_id': dataset_ids})

df.to_csv(
    join(
        "/ictstr01/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE",
        f"{DATASET_ID}_barcodes.csv",
    ),
    index=False,
)

In [4]:
# List the obs (per-cell metadata) columns available in the dataset.
adata.obs.columns

Index(['suspension_type', 'donor_id', 'is_primary_data',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id',
       'organism_ontology_term_id', 'sex_ontology_term_id', 'BMI',
       'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score',
       'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1',
       'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5',
       'cause_of_death', 'dataset', 'entropy_dataset_leiden_3',
       'entropy_original_ann_level_1_leiden_3',
       'entropy_original_ann_level_2_clean_leiden_3',
       'entropy_original_ann_level_3_clean_leiden_3',
       'entropy_subject_ID_leiden_3', 'fresh_or_frozen', 'leiden_1',
       'leiden_2', 'leiden_3', 'leiden_4', 'leiden_5', 'log10_total_counts',
       'lung_condition', 'mixed_ancestry', 'n_genes_detected',
       'o

In [6]:
# Show which Cell Ontology term is paired with each annotation label, for every
# annotation level. Only the FIRST row seen per label is kept, so a coarse label that
# spans several ontology terms is shown with just one of them.
# Note: this display option stays in effect for the rest of the session.
pd.set_option('display.max_rows', None)

ann_levels = [
    'ann_level_1',
    'ann_level_2',
    'ann_level_3',
    'ann_level_4',
    'ann_level_5',
    'ann_finest_level'
]

# One two-column frame per annotation level, all sharing the same column names.
# Building the list first and concatenating once avoids the None sentinel, the
# special case for the first level and the repeated copying of a growing frame.
mapping_frames = [
    adata.obs[[ann_level, 'cell_type_ontology_term_id']]
    .drop_duplicates(subset=ann_level)
    .rename(columns={ann_level: 'cell_type_annotation'})
    for ann_level in ann_levels
]

# Levels are stacked in the order of `ann_levels`, with a fresh 0..n-1 index.
df_all = pd.concat(mapping_frames, ignore_index=True)

df_all

Unnamed: 0,cell_type_annotation,cell_type_ontology_term_id
0,Immune,CL:0000583
1,Epithelial,CL:0002063
2,Endothelial,CL:0002543
3,Stroma,CL:2000093
4,Myeloid,CL:0000583
5,Lymphoid,CL:0000623
6,Alveolar epithelium,CL:0002063
7,Airway epithelium,CL:0002633
8,Blood vessels,CL:0002543
9,Fibroblast lineage,CL:2000093


In [7]:
# Save the annotation -> ontology-term mapping table; the relative path resolves
# against the current working directory.
df_all.to_csv('hlca_v1_annot_ontology_id_mapping_all.csv', index=False)

# Add comparable author cell type and ontology ID labels into obs

In [18]:
# Use the finest annotation level as the author cell type label,
# then count cells per label (missing values included).
adata.obs[AUTHOR_CELL_TYPE] = adata.obs['ann_finest_level']
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False)

Alveolar macrophages          68487
AT2                           61429
Suprabasal                    41158
Basal resting                 38955
Goblet (nasal)                35833
                              ...  
Mesothelium                     230
Tuft                            165
Neuroendocrine                  159
Hematopoietic stem cells         60
Lymphatic EC proliferating       28
Name: author_cell_type, Length: 61, dtype: int64

In [19]:
# Count cells per Cell Ontology term ID (missing values included).
adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts(dropna=False)

CL:0002633    80113
CL:0000583    78816
CL:0002063    62405
CL:0000158    36023
CL:0002480    35833
CL:0002145    35225
CL:0000625    29074
CL:0000861    28223
CL:0002144    23205
CL:0000624    21285
CL:0000860    17695
CL:0000623    16978
CL:0002543    12975
CL:4028006    10321
CL:0002399     9133
CL:0000875     8834
CL:0002062     7937
CL:1001568     7391
CL:0000097     6623
CL:0005012     5873
CL:4028004     5182
CL:1001603     4805
CL:4030023     4600
CL:0002138     4595
CL:0000236     4511
CL:0002632     4393
CL:0009089     3032
CL:0019019     2996
CL:0000786     1773
CL:1000312     1670
CL:2000093     1573
CL:0000313     1472
CL:0010003     1440
CL:0019001     1417
CL:0000622     1274
CL:0019003      968
CL:0000186      716
CL:0005006      561
CL:0000192      556
CL:0000784      552
CL:0000319      537
CL:0000084      500
CL:0000499      335
CL:0000990      322
CL:0000451      312
CL:0000057      276
CL:0000077      230
CL:0002075      165
CL:1000223      159
CL:0000037       60


# Check var

In [6]:
# Inspect the var (per-gene) table before modifying it.
adata.var

Unnamed: 0,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length
ENSG00000000003,False,TSPAN6,NCBITaxon:9606,gene,4530
ENSG00000000005,False,TNMD,NCBITaxon:9606,gene,1476
ENSG00000000419,False,DPM1,NCBITaxon:9606,gene,9276
ENSG00000000457,False,SCYL3,NCBITaxon:9606,gene,6883
ENSG00000000460,False,C1orf112,NCBITaxon:9606,gene,5970
...,...,...,...,...,...
ENSG00000283078,False,ENSG00000283078.1,NCBITaxon:9606,gene,1608
ENSG00000283103,False,ENSG00000283103.5,NCBITaxon:9606,gene,4585
ENSG00000283117,False,MGC4859,NCBITaxon:9606,gene,3118
ENSG00000283118,False,ENSG00000283118.1,NCBITaxon:9606,gene,644


In [7]:
# Keep the gene IDs that currently form the var index as an explicit string column.
adata.var['ensembl_id'] = adata.var_names.astype(str)

# Rename `feature_name` to `gene_symbol` and give the index a neutral name.
symbol_column_rename = {'feature_name': 'gene_symbol'}
adata.var.rename(columns=symbol_column_rename, inplace=True)
adata.var.index.name = 'index'

adata.var

Unnamed: 0_level_0,feature_is_filtered,gene_symbol,feature_reference,feature_biotype,feature_length,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000000003,False,TSPAN6,NCBITaxon:9606,gene,4530,ENSG00000000003
ENSG00000000005,False,TNMD,NCBITaxon:9606,gene,1476,ENSG00000000005
ENSG00000000419,False,DPM1,NCBITaxon:9606,gene,9276,ENSG00000000419
ENSG00000000457,False,SCYL3,NCBITaxon:9606,gene,6883,ENSG00000000457
ENSG00000000460,False,C1orf112,NCBITaxon:9606,gene,5970,ENSG00000000460
...,...,...,...,...,...,...
ENSG00000283078,False,ENSG00000283078.1,NCBITaxon:9606,gene,1608,ENSG00000283078
ENSG00000283103,False,ENSG00000283103.5,NCBITaxon:9606,gene,4585,ENSG00000283103
ENSG00000283117,False,MGC4859,NCBITaxon:9606,gene,3118,ENSG00000283117
ENSG00000283118,False,ENSG00000283118.1,NCBITaxon:9606,gene,644,ENSG00000283118


# Check raw data

In [8]:
# Largest value stored in the raw matrix. The sparse matrix's own max() is used so the
# matrix is never densified: a dense copy would hold n_obs x n_vars entries.
adata.raw.X.max()

34950.0

In [9]:
# Put the matrix stored in adata.raw into X.
adata.X = adata.raw.X

In [10]:
# Cast X to 32-bit integers.
# NOTE(review): astype() silently truncates fractional values — this assumes X holds
# whole-number counts; confirm before relying on it for other datasets.
adata.X = adata.X.astype(np.int32)

In [11]:
# Rebuild adata.raw from the current object so that raw also carries the int32 matrix.
adata.raw = adata

In [12]:
# Check the type, shape and dtype of X after the cast.
adata.X

<584944x27957 sparse matrix of type '<class 'numpy.int32'>'
	with 1139652728 stored elements in Compressed Sparse Row format>

In [13]:
# Check the type, shape and dtype of the matrix stored in raw.
adata.raw.X

<584944x27957 sparse matrix of type '<class 'numpy.int32'>'
	with 1139652728 stored elements in Compressed Sparse Row format>

In [14]:
# Largest value in X after the int32 cast; should equal the maximum seen before it.
# Computed on the sparse matrix directly to avoid allocating a dense n_obs x n_vars array.
adata.X.max()

34950

In [15]:
# Largest value in raw.X after raw was rebuilt; should match the maximum of X.
# Computed on the sparse matrix directly to avoid allocating a dense n_obs x n_vars array.
adata.raw.X.max()

34950

# Data Status

- CHECK: Raw data in X
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: ENSEMBL IDs in var

In [10]:
# Duplicate the 'sample' column under the name 'sample_ID'.
adata.obs['sample_ID'] = adata.obs['sample']

In [11]:
# Write the same gzip-compressed h5ad file into the pre-revision and then the
# post-revision output directory.
for output_dir in (OUTPUT_PATH_PREREVISION, OUTPUT_PATH_POSTREVISION):
    adata.write_h5ad(join(output_dir, f"{DATASET_ID}.h5ad"), compression='gzip')

In [12]:
# Also store the object as a Zarr store.
# NOTE(review): the path is hard-coded and repeats the dataset ID instead of using the
# constants defined at the top of the notebook.
adata.write_zarr("/ictstr01/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision_reduced_zarr/HLCA_v1_core.zarr")

  warn(f"ignoring keyword argument {k!r}")


In [5]:
import scanpy as sc
import anndata as ad
from os.path import join

# Load the two other pre-revision datasets that are merged below.
# Requires OUTPUT_PATH_PREREVISION from the configuration cell to be defined.
adata_1 = sc.read_h5ad(join(OUTPUT_PATH_PREREVISION, "lungMAP.h5ad"))
adata_2 = sc.read_h5ad(join(OUTPUT_PATH_PREREVISION, "Tabula_Sapiens_2022_publ.h5ad"))



In [16]:
# Merge lungMAP and Tabula Sapiens into a single AnnData object.
adatas = {'lungMAP': adata_1, 'Tabula_Sapiens_2022_publ': adata_2}

# Index the genes of every dataset by gene symbol so that concat aligns on symbols.
# This modifies adata_1 / adata_2 in place.
# The loop variable is deliberately not named `adata`: that would rebind the
# notebook-wide `adata` to the last dataset in the dict.
for dataset_adata in adatas.values():
    dataset_adata.var.index = dataset_adata.var['gene_symbol']
    dataset_adata.var.index.name = 'index'

# Outer join keeps the union of genes. The dict keys are stored per cell in
# obs['dataset_ID'] and appended to each cell name with '_' to keep names unique.
adata_merged = ad.concat(
    adatas,
    join='outer',
    label='dataset_ID',
    index_unique='_'
)

In [12]:
# Inspect the lungMAP var table.
adata_1.var

Unnamed: 0_level_0,gene_symbol
index,Unnamed: 1_level_1
MIR1302-2HG,MIR1302-2HG
FAM138A,FAM138A
OR4F5,OR4F5
AL627309.1,AL627309.1
AL627309.3,AL627309.3
...,...
AC141272.1,AC141272.1
AC023491.2,AC023491.2
AC007325.1,AC007325.1
AC007325.4,AC007325.4


In [13]:
# Inspect the Tabula Sapiens var table.
adata_2.var

Unnamed: 0_level_0,feature_type,highly_variable,means,dispersions,dispersions_norm,mean,std,feature_is_filtered,gene_symbol,feature_reference,feature_biotype,feature_length,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
DDX11L1,Gene Expression,False,6.398244e-05,0.835044,-0.573947,0.000039,0.005574,False,DDX11L1,NCBITaxon:9606,gene,632,ENSG00000223972
WASH7P,Gene Expression,False,2.274395e-03,2.442280,0.533203,0.001080,0.031731,False,WASH7P,NCBITaxon:9606,gene,1351,ENSG00000227232
MIR6859-1,Gene Expression,False,6.175251e-05,1.295335,-0.256874,0.000033,0.005634,False,MIR6859-1,NCBITaxon:9606,gene,68,ENSG00000278267
MIR1302-2HG,Gene Expression,False,1.372886e-04,2.656352,0.680668,0.000048,0.008041,False,MIR1302-2HG,NCBITaxon:9606,gene,1021,ENSG00000243485
MIR1302-2,Gene Expression,False,1.000000e-12,,0.000000,0.000000,1.000000,False,MIR1302-2,NCBITaxon:9606,gene,138,ENSG00000284332
...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND6,Gene Expression,False,9.634841e-01,2.466404,0.154140,0.590065,0.741395,False,MT-ND6,NCBITaxon:9606,gene,525,ENSG00000198695
MT-TE,Gene Expression,False,1.600667e-01,1.603787,-0.044396,0.083929,0.301820,False,MT-TE,NCBITaxon:9606,gene,69,ENSG00000210194
MT-CYB,Gene Expression,False,4.367693e+00,4.765751,-0.499747,3.874830,1.104192,False,MT-CYB,NCBITaxon:9606,gene,1141,ENSG00000198727
MT-TT,Gene Expression,False,6.573967e-02,0.624316,-0.719108,0.040580,0.186848,False,MT-TT,NCBITaxon:9606,gene,66,ENSG00000210195


In [19]:
# Dataset label assigned to each cell of the merged object.
adata_merged.obs['dataset_ID']

Donor27_Donor27_AAACGAACACCGTGCA-1_lungMAP                                                       lungMAP
Donor27_Donor27_AAAGAACAGCGCCATC-1_lungMAP                                                       lungMAP
Donor27_Donor27_AAAGAACCAGGCAATG-1_lungMAP                                                       lungMAP
Donor27_Donor27_AAAGGATAGTAGCTCT-1_lungMAP                                                       lungMAP
Donor27_Donor27_AAAGTCCCATCCGGCA-1_lungMAP                                                       lungMAP
                                                                                          ...           
TTTGTTGTCAAGCCCG_TSP2_Lung_proxmedialdistal_10X_1_2_Tabula_Sapiens_2022_publ    Tabula_Sapiens_2022_publ
TTTGTTGTCGTCAACA_TSP2_Lung_proxmedialdistal_10X_1_2_Tabula_Sapiens_2022_publ    Tabula_Sapiens_2022_publ
TTTGTTGTCTACCACC_TSP2_Lung_proxmedialdistal_10X_1_2_Tabula_Sapiens_2022_publ    Tabula_Sapiens_2022_publ
TTTGTTGTCTAGCCAA_TSP2_Lung_proxmedialdistal_10X_1_2_Tab

In [17]:
# Inspect the var table of the merged object.
adata_merged.var

5S_rRNA_ENSG00000276861
5S_rRNA_ENSG00000277411
5S_rRNA_ENSG00000277488
5S_rRNA_ENSG00000285609
5S_rRNA_ENSG00000285626
...
ZZZ3
hsa-mir-1253
hsa-mir-423
hsa-mir-8069-1
snoZ196


In [27]:
# Inspect the lungMAP var table again (same check as in the earlier cell).
adata_1.var

Unnamed: 0_level_0,gene_symbol
index,Unnamed: 1_level_1
MIR1302-2HG,MIR1302-2HG
FAM138A,FAM138A
OR4F5,OR4F5
AL627309.1,AL627309.1
AL627309.3,AL627309.3
...,...
AC141272.1,AC141272.1
AC023491.2,AC023491.2
AC007325.1,AC007325.1
AC007325.4,AC007325.4


In [28]:
# Inspect the Tabula Sapiens var table again (same check as in the earlier cell).
adata_2.var

Unnamed: 0_level_0,feature_type,highly_variable,means,dispersions,dispersions_norm,mean,std,feature_is_filtered,gene_symbol,feature_reference,feature_biotype,feature_length,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
DDX11L1,Gene Expression,False,6.398244e-05,0.835044,-0.573947,0.000039,0.005574,False,DDX11L1,NCBITaxon:9606,gene,632,ENSG00000223972
WASH7P,Gene Expression,False,2.274395e-03,2.442280,0.533203,0.001080,0.031731,False,WASH7P,NCBITaxon:9606,gene,1351,ENSG00000227232
MIR6859-1,Gene Expression,False,6.175251e-05,1.295335,-0.256874,0.000033,0.005634,False,MIR6859-1,NCBITaxon:9606,gene,68,ENSG00000278267
MIR1302-2HG,Gene Expression,False,1.372886e-04,2.656352,0.680668,0.000048,0.008041,False,MIR1302-2HG,NCBITaxon:9606,gene,1021,ENSG00000243485
MIR1302-2,Gene Expression,False,1.000000e-12,,0.000000,0.000000,1.000000,False,MIR1302-2,NCBITaxon:9606,gene,138,ENSG00000284332
...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND6,Gene Expression,False,9.634841e-01,2.466404,0.154140,0.590065,0.741395,False,MT-ND6,NCBITaxon:9606,gene,525,ENSG00000198695
MT-TE,Gene Expression,False,1.600667e-01,1.603787,-0.044396,0.083929,0.301820,False,MT-TE,NCBITaxon:9606,gene,69,ENSG00000210194
MT-CYB,Gene Expression,False,4.367693e+00,4.765751,-0.499747,3.874830,1.104192,False,MT-CYB,NCBITaxon:9606,gene,1141,ENSG00000198727
MT-TT,Gene Expression,False,6.573967e-02,0.624316,-0.719108,0.040580,0.186848,False,MT-TT,NCBITaxon:9606,gene,66,ENSG00000210195


In [29]:
# Inspect the var table of the merged object again.
adata_merged.var

5S_rRNA_ENSG00000276861
5S_rRNA_ENSG00000277411
5S_rRNA_ENSG00000277488
5S_rRNA_ENSG00000285609
5S_rRNA_ENSG00000285626
...
ZZZ3
hsa-mir-1253
hsa-mir-423
hsa-mir-8069-1
snoZ196


In [31]:
# Show the author cell type marker table stored in uns of the merged object.
adata_merged.uns['author_cell_type_markers']

Unnamed: 0,author_cell_type,cell_type_ontology_term_id,Comment,full_name,abbreviation,author_cell_type_markers
0,Alveolar fibroblast 2 (AF2),CL:4028006,,Alveolar fibroblast 2,AF2,MFAP5; SCARA5; CDON; DCN; PLA2G2A; SFRP2; LUM;...
1,Alveolar fibroblast 1 (AF1),CL:4028004,,Alveolar fibroblast 1,AF1,TCF21; PCDH15; WNT2; ROBO2; LUM; SLIT2; DCN; P...
2,Alveolar macrophage (AM),CL:0000583,,Alveolar macrophage,AM,FABP4; MARCO; CYP27A1; PPARG; ABCG1; SIGLEC1; ...
3,Alveolar type 1 cell (AT1),CL:0002062,,Alveolar type 1 cell,AT1,AGER; RTKN2; SEMA3B; NTM; NCKAP5; LMO7; KHDRBS...
4,Alveolar type 2 cell (AT2),CL:0002063,,Alveolar type 2 cell,AT2,ABCA3; LAMP3; KCNJ15; SFTPC; SFTPA1; SFTPA2; S...
5,B cell (B),CL:0000236,,B cell,B,BANK1; MS4A1; CD19; BACH2; IGHM; EBF1; CD79A; ...
6,Basal cell (Basal),CL:0002633,,Basal cell,Basal,KRT5; TP63; NGFR; KRT15; KRT17; MMP10; S100A2;...
7,CD4+ T cell (CD4 T),CL:0000624,,CD4+ T cell,CD4 T,CD3E; LEF1; CD40LG; MAL; CD4; CD69; ITK; LTB; ...
8,CD8+ T cell (CD8 T),CL:0000625,,CD8+ T cell,CD8 T,CD8A; CD3E; CD8B; CCL5; NKG7; GZMA; GZMH; IL32...
9,Inflammatory monocyte (iMON),CL:0000860,,Inflammatory monocyte,iMON,VCAN; FCN1; CD14; S100A8; JARID2; S100A12; THB...


In [23]:
# List the keys present in uns of the merged object.
dict(adata_merged.uns).keys()

dict_keys(['author_cell_type_markers', 'author_cell_type_markers_level_2', 'comments', '_scvi', '_training_mode', 'citation', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_id_colors', 'hvg', 'neighbors', 'schema_reference', 'schema_version', 'tissue_in_publication_colors', 'umap'])

In [21]:
# Print the summary (dimensions and field names) of the merged object.
adata_merged

AnnData object with n_obs × n_vars = 156537 × 67983
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_SCT', 'nFeature_SCT', 'S.Score', 'G2M.Score', 'Phase', 'pMT', 'DataID', 'DonorID', 'Dataset', 'Age', 'Sex', 'lineage_level1', 'lineage_level2', 'celltype_level1', 'celltype_level2', 'celltype_level3', 'celltype_level3_fullname', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_enrichment', 'cell_viability_percentage', 'cell_number_loaded', 'sample_collection_year', 'assay_ontology_term_id', 'library_preparation_batch', 'library_sequencing_run', 'sequenced_fragment', 'sequencing_platform', 'is_primary_d

In [14]:
# Cell names of the merged object.
adata_merged.obs.index

Index(['Donor27_Donor27_AAACGAACACCGTGCA-1_lungMAP',
       'Donor27_Donor27_AAAGAACAGCGCCATC-1_lungMAP',
       'Donor27_Donor27_AAAGAACCAGGCAATG-1_lungMAP',
       'Donor27_Donor27_AAAGGATAGTAGCTCT-1_lungMAP',
       'Donor27_Donor27_AAAGTCCCATCCGGCA-1_lungMAP',
       'Donor27_Donor27_AAAGTGACACGGCCAT-1_lungMAP',
       'Donor27_Donor27_AAATGGACAAAGGAGA-1_lungMAP',
       'Donor27_Donor27_AAATGGACACAAGCCC-1_lungMAP',
       'Donor27_Donor27_AACAACCGTCTCACAA-1_lungMAP',
       'Donor27_Donor27_AACAACCGTGACTATC-1_lungMAP',
       ...
       'TTTGGTTTCCAGCTCT_TSP2_Lung_proxmedialdistal_10X_1_2_Tabula_Sapiens_2022_publ',
       'TTTGGTTTCGAGATAA_TSP2_Lung_proxmedialdistal_10X_1_2_Tabula_Sapiens_2022_publ',
       'TTTGTTGAGCCTAGGA_TSP2_Lung_proxmedialdistal_10X_1_2_Tabula_Sapiens_2022_publ',
       'TTTGTTGAGGCCACTC_TSP2_Lung_proxmedialdistal_10X_1_2_Tabula_Sapiens_2022_publ',
       'TTTGTTGGTTCCGCTT_TSP2_Lung_proxmedialdistal_10X_1_2_Tabula_Sapiens_2022_publ',
       'TTTGTTGTCAAGCCCG

In [15]:
# Number of cells per dataset in the merged object.
# NOTE(review): the ad.concat call in this notebook stores its label in
# obs['dataset_ID'], whereas this cell reads obs['dataset'] — confirm which column
# is intended.
adata_merged.obs['dataset'].value_counts()

lungMAP                     122445
Tabula_Sapiens_2022_publ     34092
Name: dataset, dtype: int64

In [17]:
# Inspect the var table of the merged object once more.
adata_merged.var

5S_rRNA_ENSG00000276861
5S_rRNA_ENSG00000277411
5S_rRNA_ENSG00000277488
5S_rRNA_ENSG00000285609
5S_rRNA_ENSG00000285626
...
ZZZ3
hsa-mir-1253
hsa-mir-423
hsa-mir-8069-1
snoZ196


In [4]:
# Reload the processed dataset from the post-revision output directory.
adata = sc.read_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"))

In [9]:
# Distinct sample names as a list, ordered by descending cell count
# (value_counts sorts by frequency; a missing value would be listed too).
adata.obs['sample'].value_counts(dropna=False).index.tolist()

['SC144',
 'distal 2',
 'distal 3',
 'SC182',
 'GRO-09_biopsy',
 'GRO-10_biopsy',
 'medial 2',
 'GRO-03_biopsy',
 'VUHD67',
 'F02617',
 'F02611',
 'F01851',
 'GRO-04_biopsy',
 'SC184',
 'VUHD68',
 'proximal 3',
 'F02522',
 'distal 1a',
 'SC84',
 'F02524',
 'SC27',
 'SC183',
 'F02607',
 'F01394',
 'SC156',
 'SC85',
 'SC86',
 'SC07',
 'SC88',
 'SC22',
 'SC142',
 'SC20',
 'SC18',
 '390C_12h',
 'GRO-09_nasal_brush',
 'SC173',
 'SC29',
 'D353_Brus_Nas1',
 'GRO-07_biopsy',
 'SC141',
 'SC56',
 'D353_Brus_Dis1',
 'GRO-04_nasal_brush',
 'D372_Biop_Pro1',
 'F02528',
 'SC155',
 'SC24',
 'F01607',
 'D353_Biop_Pro1',
 'SC143',
 'SC10',
 'D372_Biop_Int2',
 '356C_0h',
 '356C_24h',
 'SC87',
 'T85',
 '390C_0h',
 'GRO-06_biopsy',
 'F02526',
 '356C_12h',
 'VUHD66',
 'T153',
 'SC185',
 'SC45',
 '390C_72h',
 '368C_12h',
 'T101',
 'SC59',
 'F01639',
 'T164',
 'SC174_SC172',
 'D339_Biop_Int1',
 'D363_Brus_Nas1',
 'VUHD101',
 'SC181',
 'D367_Biop_Pro1',
 'GRO-08_biopsy',
 'T167',
 'D326_Biop_Pro1',
 'D344_Bru

### Adjustment for celltypist

In [2]:
# Load the copy of the dataset stored in the adata_postrevision_reduced directory.
adata = sc.read_h5ad("/ictstr01/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision_reduced/HLCA_v1_core.h5ad")

In [6]:
# Remove the stored embeddings (obsm) and neighbour graphs (obsp) from the object.
# Each key is checked first: this section reads the file and later writes it back to
# the same path, so on a second run the entries are already gone and a bare `del`
# would raise KeyError.
for embedding_key in ('X_scanvi_emb', 'X_umap'):
    if embedding_key in adata.obsm:
        del adata.obsm[embedding_key]

for graph_key in ('connectivities', 'distances'):
    if graph_key in adata.obsp:
        del adata.obsp[graph_key]

In [8]:
# Write the stripped object back to disk.
# NOTE(review): this overwrites the very file loaded at the start of this section, and
# unlike the earlier h5ad writes no compression is requested — confirm both are intended.
adata.write_h5ad("/ictstr01/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision_reduced/HLCA_v1_core.h5ad")