In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
from pathlib import Path
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

pd.set_option('display.max_rows', 100)

package ‘methods’ was built under R version 4.3.3 
1: package ‘datasets’ was built under R version 4.3.3 
2: package ‘utils’ was built under R version 4.3.3 
3: package ‘grDevices’ was built under R version 4.3.3 
4: package ‘graphics’ was built under R version 4.3.3 
5: package ‘stats’ was built under R version 4.3.3 


In [2]:
DATASET_ID = "Sauler_unpubl"
MTX_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.mtx"
OUTPUT_PATH_PREREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision'
OUTPUT_PATH_POSTREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision'

In [3]:
# Name constants
AUTHOR_CELL_TYPE_L0 = 'author_cell_type_level_0'
AUTHOR_CELL_TYPE_L1 = 'author_cell_type_level_1'

CELL_TYPE_ONTOLOGY_ID_L0 = 'cell_type_ontology_term_id_level_0'
CELL_TYPE_ONTOLOGY_ID_L1 = 'cell_type_ontology_term_id_level_1'

CELL_TYPE_ONTOLOGY_LABEL_L0 = 'cell_type_ontology_term_label_level_0'
CELL_TYPE_ONTOLOGY_LABEL_L1 = 'cell_type_ontology_term_label_level_1'

AUTHOR_CELL_TYPE_DESCRIPTION_L0 = 'author_cell_type_description_level_0'
AUTHOR_CELL_TYPE_DESCRIPTION_L1 = 'author_cell_type_description_level_1'

MARKER_GENES_L0 = 'author_cell_type_markers_level_0'
MARKER_GENES_L1 = 'author_cell_type_markers_level_1'

# Finest grained annotation will be generic dataset cell type
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
CELL_TYPE_ONTOLOGY_LABEL = 'cell_type_ontology_term_label'
MARKER_GENES = 'author_cell_type_markers'
AUTHOR_CELL_TYPE_DESCRIPTION = 'author_cell_type_description'

# Load data

In [4]:
obs = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "obs")
uns = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "uns")

In [42]:
rds_dir = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/rds_files/"
rds_files = [rds_dir+x for x in os.listdir(rds_dir) if x.endswith('RDS')]
rds_files

['/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/rds_files/cellVein.RDS',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/rds_files/cellFibroblast_Adventitial.RDS',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/rds_files/cellgCap.RDS',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/rds_files/cellSecretory.RDS',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/rds_files/cellPericyte.RDS',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/rds_files/cellCiliated.RDS',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/rds_files/cellAM.RDS',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/rds_files/cellAT2.RDS',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/rds_files/cellGoblet.RDS',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/rds_files/cellSMC.RDS',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/rds_files/cellAerocyte.RDS',
 '/home/ic

In [3]:
anndata2ri.activate()
%load_ext rpy2.ipython

  anndata2ri.activate()


In [45]:
%%R -i rds_files

mtx_dir = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/mtx_files/"

for (rds_file in rds_files) {
    rds = readRDS(rds_file)
    barcodes = colnames(rds)
    rds_basename = basename(rds_file)
    rds_basename = sub("\\..*$", "", rds_basename)
    rds_basename = substr(rds_basename, 5, nchar(rds_basename))
    print(rds_basename)
    print(length(barcodes))
    print(dim(rds))
    write.csv(barcodes, paste0(mtx_dir, rds_basename, "_barcodes.csv"))
    writeMM(rds, paste0(mtx_dir, rds_basename, ".mtx"))
}

[1] "Vein"
[1] 2171
[1] 36601  2171
[1] "Fibroblast_Adventitial"
[1] 5797
[1] 36601  5797
[1] "gCap"
[1] 9131
[1] 36601  9131
[1] "Secretory"
[1] 48914
[1] 36601 48914
[1] "Pericyte"
[1] 1249
[1] 36601  1249
[1] "Ciliated"
[1] 31003
[1] 36601 31003
[1] "AM"
[1] 153741
[1]  36601 153741
[1] "AT2"
[1] 596855
[1]  36601 596855
[1] "Goblet"
[1] 9800
[1] 36601  9800
[1] "SMC"
[1] 3545
[1] 36601  3545
[1] "Aerocyte"
[1] 8001
[1] 36601  8001
[1] "T"
[1] 10754
[1] 36601 10754
[1] "Basal"
[1] 2584
[1] 36601  2584
[1] "Mast"
[1] 786
[1] 36601   786
[1] "ABC"
[1] 4155
[1] 36601  4155
[1] "Fibroblast_PB"
[1] 2454
[1] 36601  2454
[1] "Monocyte"
[1] 16477
[1] 36601 16477
[1] "B_plasma"
[1] 1624
[1] 36601  1624
[1] "Fibroblast_FRC"
[1] 528
[1] 36601   528
[1] "Lymphatic"
[1] 7925
[1] 36601  7925
[1] "Mesothelial"
[1] 3945
[1] 36601  3945
[1] "IM"
[1] 12557
[1] 36601 12557
[1] "Systemic"
[1] 1523
[1] 36601  1523
[1] "NK"
[1] 2313
[1] 36601  2313
[1] "Artery"
[1] 2891
[1] 36601  2891
[1] "Fibroblast_Al

In [46]:
import scipy.io
import scipy.sparse

def read_mtx_file(file_path):
    return scipy.io.mmread(file_path)

dir_mtx = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/mtx_files/"

cell_types = [x.split('.')[0] for x in os.listdir(dir_mtx) if x.endswith('mtx')]

concatenated_matrix = None
cell_type_annotation = []
barcodes = []

for idx, cell_type in enumerate(cell_types):
    if idx == 0:
        print(f"First cell type: {cell_type}")
        concatenated_matrix = read_mtx_file(join(dir_mtx, f"{cell_type}.mtx"))
        barcodes += pd.read_csv(join(dir_mtx, f"{cell_type}_barcodes.csv"))['x'].tolist()
        cell_num = concatenated_matrix.shape[1]
        cell_type_annotation += [cell_type] * cell_num
        print(f"First cell type mtx shape: {concatenated_matrix.shape}")
        print(f"First cell type barcodes: {len(barcodes)}")
        print(f"First cell type cell num: {cell_num}")
        print("-----------------------------------------")
    else:
        print(f"Current cell type: {cell_type}")
        mtx = read_mtx_file(join(dir_mtx, f"{cell_type}.mtx"))
        concatenated_matrix = scipy.sparse.hstack([concatenated_matrix, mtx])
        barcodes += pd.read_csv(join(dir_mtx, f"{cell_type}_barcodes.csv"))['x'].tolist()
        cell_num = mtx.shape[1]
        cell_type_annotation += [cell_type] * cell_num
        print(f"Current cell type mtx shape: {mtx.shape}")
        print(f"Current cell type barcodes: {len(barcodes)}")
        print(f"Current cell type cell num: {cell_num}")
        print("-----------------------------------------")

First cell type: Fibroblast_Adventitial
First cell type mtx shape: (36601, 5797)
First cell type barcodes: 5797
First cell type cell num: 5797
-----------------------------------------
Current cell type: Fibroblast_CTHRC1
Current cell type mtx shape: (36601, 2311)
Current cell type barcodes: 8108
Current cell type cell num: 2311
-----------------------------------------
Current cell type: ABC
Current cell type mtx shape: (36601, 4155)
Current cell type barcodes: 12263
Current cell type cell num: 4155
-----------------------------------------
Current cell type: NK
Current cell type mtx shape: (36601, 2313)
Current cell type barcodes: 14576
Current cell type cell num: 2313
-----------------------------------------
Current cell type: Fibroblast_PB
Current cell type mtx shape: (36601, 2454)
Current cell type barcodes: 17030
Current cell type cell num: 2454
-----------------------------------------
Current cell type: Fibroblast_Alveolar
Current cell type mtx shape: (36601, 38338)
Current ce

In [53]:
# save sparse matrix
scipy.io.mmwrite("/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/count_matrix_all.mtx", concatenated_matrix)

In [51]:
# cell type annotation to dataframe
cell_type_annotation_df = pd.DataFrame(cell_type_annotation, columns=["cell_type"])
cell_type_annotation_df.to_csv("/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/cell_type_annotation.csv", index=False)

In [52]:
barcodes_df = pd.DataFrame(barcodes, columns=["barcode"])
barcodes_df.to_csv("/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/barcodes.csv", index=False)

In [21]:
# merged in separate script:
adata = sc.read_h5ad("/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/adata_raw.h5ad")

In [23]:
adata.obs['sample_ID'] = [x.split('_')[0] for x in adata.obs.index]

In [24]:
pd.set_option('display.max_rows', None)
adata.obs['sample_ID'].value_counts(dropna=False)

JN10XHT126    41152
JN10XHT96     25920
JN10XHT38     24653
JN10XHT52     23215
JN10XHT123    22563
JN10XHT22     21997
JN10XHT143    21902
JN10XHT27     19992
JN10XHT144    18758
JN10XHT124    18667
JN10XHT160    18574
JN10XHT122    18513
JN10XHT154    18327
JN10XHT95     18300
JN10XHT50     18267
JN10XHT98     18226
JN10XHT62     18123
JN10XHT133    17454
JN10XHT49     17450
JN10XHT139    17361
JN10XHT85     16950
JN10XHT102    16711
JN10XHT21     16595
JN10XHT9      15776
JN10XHT138    15589
JN10XHT125    15483
JN10XHT136    15210
JN10XHT26     15146
JN10XHT111    15137
JN10XHT119    14998
JN10XHT116    14970
JN10XHT99     14945
JN10XHT147    14796
JN10XHT120    14707
JN10XHT114    14411
JN10XHT57     14398
JN10XHT145    13955
JN10XHT91     13878
JN10XHT28     13865
JN10XHT97     13851
JN10XHT141    13830
JN10XHT131    13771
JN10XHT87     13578
JN10XHT40     13437
JN10XHT30     13269
JN10XHT132    13254
JN10XHT79     13217
JN10XHT14     13125
JN10XHT129    12917
JN10XHT164    12805


# Validate obs and uns from Tier 1 Metadata Template

In [25]:
val_workflow = ValidationWorkflow(
    input = uns,
    axis = 'uns'
)

validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,single NUQ RNA sequencing,Maor Sauler,,,"UNPUBLISHED Data, Protected under embargo",


In [26]:
obs['donor_id'] = obs['donor_id'].astype(str)

val_workflow = ValidationWorkflow(
    input = obs,
    axis = 'obs'
)

validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,JN10XHT1,121,,Yale,,,,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
1,JN10XHT10,71,,Yale,,,,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
2,JN10XHT100,45,,Yale,,,,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
3,JN10XHT101,46,,Yale,,,,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
4,JN10XHT102,58,,Yale,,,,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
5,JN10XHT103,142,,Yale,,,,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
6,JN10XHT104,61,,Yale,,,,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
7,JN10XHT105,37,,Yale,,,,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
8,JN10XHT106,69,,Yale,,,,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
9,JN10XHT107,23,,Yale,,,,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown


In [27]:
overlap = set(obs['sample_ID']) - set(adata.obs['sample_ID'])
overlap_other_side = set(adata.obs['sample_ID']) - set(obs['sample_ID'])

In [28]:
overlap

set()

In [29]:
overlap_other_side

{'JN10XHT128', 'JN10XHT15'}

In [30]:
# remove overlap_other_side from adata (== 21 cells, so probably low quality samples)
adata = adata[~adata.obs['sample_ID'].isin(overlap_other_side)].copy()
adata

AnnData object with n_obs × n_vars = 1529919 × 36601
    obs: 'author_cell_type', 'sample_ID'

# Validate obs and uns from adata

In [31]:
merger = AnnDataMerger(
    adata = adata,
    uns_df = uns
)

adata = merger.add_uns_metadata()

adata

AnnData object with n_obs × n_vars = 1529919 × 36601
    obs: 'author_cell_type', 'sample_ID'
    uns: 'title', 'study_PI', 'batch_condition', 'default_embedding', 'unpublished', 'comments'

In [34]:
# Merge obs metadata
pd.set_option('display.max_rows', 100)
merged_df = pd.merge(adata.obs, obs, on='sample_ID', how='left')
merged_df.index = adata.obs.index
merged_df

Unnamed: 0,author_cell_type,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
JN10XHT119_HHT_TCCTTCTCACTGATTG-1,Fibroblast_Adventitial,JN10XHT119,43,,Yale,,,,,,...,3 prime tag,EFO:0008637,true,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
JN10XHT123_HHT_AGAACCTTCCTTCTGG-1,Fibroblast_Adventitial,JN10XHT123,91,,Yale,,,,,,...,3 prime tag,EFO:0008637,true,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
JN10XHT129_HHT_AGACTCACATGTGGCC-1,Fibroblast_Adventitial,JN10XHT129,60,,Yale,,,,,,...,3 prime tag,EFO:0008637,true,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
JN10XHT129_HHT_TTCGGTCAGACTTCAC-1,Fibroblast_Adventitial,JN10XHT129,60,,Yale,,,,,,...,3 prime tag,EFO:0008637,true,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
JN10XHT137_HHT_CCGGTAGCAGTCTACA-1,Fibroblast_Adventitial,JN10XHT137,57,,Yale,,,,,,...,3 prime tag,EFO:0008637,true,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
JN10XHT37_HHT_CCTGCATCATATAGCC-1,PNEC,JN10XHT37,19,,Yale,,,,,,...,3 prime tag,EFO:0008637,true,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
JN10XHT37_HHT_CGGGTCAGTGTTTACG-1,PNEC,JN10XHT37,19,,Yale,,,,,,...,3 prime tag,EFO:0008637,true,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
JN10XHT37_HHT_TCATGAGAGTGCGTCC-1,PNEC,JN10XHT37,19,,Yale,,,,,,...,3 prime tag,EFO:0008637,true,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown
JN10XHT79_HHT_TACCGGGAGAGCCATG-1,PNEC,JN10XHT79,21,,Yale,,,,,,...,3 prime tag,EFO:0008637,true,GRCh38,v98,CellRanger 7.1,yes,PATO:0000461,unknown,unknown


In [35]:
adata.obs = merged_df

# Add author cell type markers to UNS

In [None]:
# no cell type markers provided -> will be done by CAP

# Check author cell type annotations and Cell Ontology IDs

In [36]:
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False).index.tolist()

['AT2',
 'AT1',
 'AM',
 'Secretory',
 'Fibroblast_Alveolar',
 'Ciliated',
 'Monocyte',
 'IM',
 'T',
 'Goblet',
 'gCap',
 'Aerocyte',
 'Lymphatic',
 'Fibroblast_Adventitial',
 'Mac_div',
 'ABC',
 'Mesothelial',
 'SMC',
 'Artery',
 'Basal',
 'Fibroblast_PB',
 'NK',
 'Fibroblast_CTHRC1',
 'Vein',
 'B_plasma',
 'Systemic',
 'DC',
 'Pericyte',
 'Mast',
 'PNEC',
 'Fibroblast_FRC',
 'Endo_div',
 'Tuft']

In [37]:
cell_ontology_dict = {
    'AT2': 'CL:0002063',
    'AT1': 'CL:0002062',
    'AM': 'CL:0000583',
    'Secretory': 'CL:0000151',
    'Fibroblast_Alveolar': '',
    'Ciliated': 'CL:0000064',
    'Monocyte': 'CL:0000576',
    'IM': 'CL:4033043',
    'T': 'CL:0000084',
    'Goblet': 'CL:0000160',
    'gCap': 'CL:4028002',
    'Aerocyte': 'NaN', #NaN
    'Lymphatic': 'NaN', #NaN
    'Fibroblast_Adventitial': 'CL:0000057', #fibroblast
    'Mac_div': 'CL:0000235', # macrophage
    'ABC': 'CL:0000236', # B cell
    'Mesothelial': 'CL:0000077',
    'SMC': 'CL:0000192',
    'Artery': 'NaN', #NaN
    'Basal': 'CL:0000646',
    'Fibroblast_PB': 'CL:0000057', #fibroblast
    'NK': 'CL:0000623',
    'Fibroblast_CTHRC1': 'CL:0000057', #fibroblast
    'Vein': 'NaN', #NaN
    'B_plasma': 'CL:0000786',
    'Systemic': 'NaN',
    'DC': 'CL:0000451',
    'Pericyte': 'CL:0000669',
    'Mast': 'CL:0000097',
    'PNEC': 'CL:1000223',
    'Fibroblast_FRC': 'CL:0000057', #fibroblast
    'Endo_div': 'CL:0000115',
    'Tuft': 'CL:0002204' # brush cell
}

In [38]:
adata.obs[CELL_TYPE_ONTOLOGY_ID] = adata.obs[AUTHOR_CELL_TYPE].map(cell_ontology_dict)

adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts(dropna=False)

CL:0002063    596855
CL:0002062    540665
CL:0000583    153740
CL:0000151     48914
               38338
CL:0000064     31003
NaN            22511
CL:0000576     16477
CL:4033043     12557
CL:0000057     11090
CL:0000084     10754
CL:0000160      9800
CL:4028002      9131
CL:0000235      5616
CL:0000236      4155
CL:0000077      3940
CL:0000192      3545
CL:0000646      2584
CL:0000623      2313
CL:0000786      1624
CL:0000451      1342
CL:0000669      1249
CL:0000097       786
CL:1000223       711
CL:0000115       127
CL:0002204        92
Name: cell_type_ontology_term_id, dtype: int64

# Check whether ENSEMBL IDs in var

In [39]:
pd.set_option('display.max_rows', 100)
adata.var

MIR1302-2HG
FAM138A
OR4F5
AL627309.1
AL627309.3
...
AC141272.1
AC023491.2
AC007325.1
AC007325.4
AC007325.2


In [40]:
adata.var['gene_symbol'] = adata.var.index
adata.var

Unnamed: 0,gene_symbol
MIR1302-2HG,MIR1302-2HG
FAM138A,FAM138A
OR4F5,OR4F5
AL627309.1,AL627309.1
AL627309.3,AL627309.3
...,...
AC141272.1,AC141272.1
AC023491.2,AC023491.2
AC007325.1,AC007325.1
AC007325.4,AC007325.4


In [41]:
ensembl_mapping = pd.read_csv("/ictstr01/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kaminski_unpubl/gene_symbols_Sauler_mapped.csv")
ensembl_mapping

Unnamed: 0.1,Unnamed: 0,hgnc_symbol,ensembl_gene_id
0,1,A3GALT2,ENSG00000184389
1,2,AADACL3,ENSG00000188984
2,3,AADACL4,ENSG00000204518
3,4,AAK1,ENSG00000115977
4,5,ABCA4,ENSG00000198691
...,...,...,...
27096,27097,ZSWIM1,ENSG00000168612
27097,27098,ZSWIM3,ENSG00000132801
27098,27099,ZSWIM9,ENSG00000185453
27099,27100,ZXDA,ENSG00000198205


In [42]:
ensembl_mapping_dict = dict(zip(ensembl_mapping['hgnc_symbol'], ensembl_mapping['ensembl_gene_id']))

adata.var['ensembl_id'] = adata.var['gene_symbol'].map(ensembl_mapping_dict)

In [43]:
adata.var['ensembl_id'].value_counts(dropna=False)

NaN                12570
ENSG00000243485        1
ENSG00000258955        1
ENSG00000258694        1
ENSG00000087303        1
                   ...  
ENSG00000137161        1
ENSG00000171611        1
ENSG00000221821        1
ENSG00000146223        1
ENSG00000274847        1
Name: ensembl_id, Length: 24032, dtype: int64

In [44]:
adata.var.index.name = 'index'

adata.var

Unnamed: 0_level_0,gene_symbol,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-2HG,MIR1302-2HG,ENSG00000243485
FAM138A,FAM138A,ENSG00000237613
OR4F5,OR4F5,ENSG00000186092
AL627309.1,AL627309.1,
AL627309.3,AL627309.3,
...,...,...
AC141272.1,AC141272.1,
AC023491.2,AC023491.2,
AC007325.1,AC007325.1,
AC007325.4,AC007325.4,


# Check raw data

In [3]:
adata.X.toarray().max()

: 

: 

In [45]:
adata.X = adata.X.astype(np.int64)

In [46]:
adata.raw = adata

In [47]:
adata.X

<1529919x36601 sparse matrix of type '<class 'numpy.int64'>'
	with 2616853086 stored elements in Compressed Sparse Row format>

In [48]:
adata.raw.X

<1529919x36601 sparse matrix of type '<class 'numpy.int64'>'
	with 2616853086 stored elements in Compressed Sparse Row format>

In [None]:
adata.X.toarray().max()

In [None]:
adata.raw.X.toarray().max()

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object  same as Google Sheet

### OBS Validation
- OK: Tier 1 OBS Google Sheet
- OK: Tier 1 OBS AnnData Object same as Google Sheet

# Data Submission Status

- CHECK: Raw counts in X and in raw
- CHECK: Tier 1 Metadata in OBS
- CHECK: Cell ontology IDs mapped ourselves
- CHECK: Author cell type in OBS
- MISSING: Marker genes not provided -> CAP
- CHECK: ENSEMBL IDs mapped ourselves -> 12570 could not me mapped using BiomaRt

# Revisions:

DONE

In [49]:
adata.write_h5ad(join(OUTPUT_PATH_PREREVISION, f"{DATASET_ID}.h5ad"))
adata.write_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"))

: 

In [3]:
adata = sc.read_h5ad("/ictstr01/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision_reduced/Sauler_unpubl.h5ad")

In [4]:
# split adata into two datasets
adata_chunk_1 = adata[:adata.shape[0]//2].copy()
adata_chunk_2 = adata[adata.shape[0]//2:].copy()

In [5]:
adata_chunk_1

AnnData object with n_obs × n_vars = 764959 × 36601
    obs: 'sample_ID', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_enrichment', 'cell_viability_percentage', 'cell_number_loaded', 'sample_collection_year', 'assay_ontology_term_id', 'library_preparation_batch', 'library_sequencing_run', 'sequenced_fragment', 'sequencing_platform', 'is_primary_data', 'reference_genome', 'gene_annotation_version', 'alignment_software', 'intron_inclusion', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'author_cell_type', 'cell_type_ontology_term_id', 'author_cell_

In [6]:
adata_chunk_2

AnnData object with n_obs × n_vars = 764960 × 36601
    obs: 'sample_ID', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_enrichment', 'cell_viability_percentage', 'cell_number_loaded', 'sample_collection_year', 'assay_ontology_term_id', 'library_preparation_batch', 'library_sequencing_run', 'sequenced_fragment', 'sequencing_platform', 'is_primary_data', 'reference_genome', 'gene_annotation_version', 'alignment_software', 'intron_inclusion', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'author_cell_type', 'cell_type_ontology_term_id', 'author_cell_

In [7]:
adata_chunk_1.write_h5ad("/ictstr01/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision_reduced/Sauler_unpubl_1.h5ad")

In [None]:
adata_chunk_2.write_h5ad("/ictstr01/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision_reduced/Sauler_unpubl_2.h5ad")

In [3]:
adata_1 = sc.read_h5ad("/ictstr01/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision_reduced/Sauler_unpubl_1.h5ad")