In [1]:
import pertpy as pt
import scanpy as sc
import pandas as pd

Global seed set to 0


This notebook is mainly used to explore the CellLineMetaData.lookup function in pertpy. In general, the lookup object provides the following functions:
- Summarize the metadata in the database, e.g. the number of cell lines, the number of genes/proteins measured
- Give an overview of the possible reference_id (cell line identifiers in the metadata)
- Given a list of unique query_id (cell line identifiers in adata.obs), return the number of identifiers that are matched in the metadata

In [2]:
# Two AnnData objects are used as examples in this notebook.
# The first one is loaded with pertpy's dialogue_example().
adata_dialogue = pt.dt.dialogue_example()
adata_dialogue

AnnData object with n_obs × n_vars = 5374 × 6329
    obs: 'nCount_RNA', 'nFeature_RNA', 'cellQ', 'gender', 'location', 'clinical.status', 'cell.subtypes', 'pathology', 'origin', 'subset'
    var: 'name'

In [3]:
# The .obs columns listed above contain no cell line identifier, so assign a constant
# cell line name to every cell; it is used as the query id in the lookups below.
adata_dialogue.obs['cell_line_name'] = 'MCF7'

In [4]:
# The second example dataset is loaded with pertpy's mcfarland_2020(); its .obs already
# contains cell line identifier columns such as 'DepMap_ID' and 'cell_line'.
adata_mc = pt.dt.mcfarland_2020()
adata_mc

AnnData object with n_obs × n_vars = 182875 × 32738
    obs: 'DepMap_ID', 'cancer', 'cell_det_rate', 'cell_line', 'cell_quality', 'channel', 'disease', 'dose_unit', 'dose_value', 'doublet_CL1', 'doublet_CL2', 'doublet_GMM_prob', 'doublet_dev_imp', 'doublet_z_margin', 'hash_assignment', 'hash_tag', 'num_SNPs', 'organism', 'percent.mito', 'perturbation', 'perturbation_type', 'sex', 'singlet_ID', 'singlet_dev', 'singlet_dev_z', 'singlet_margin', 'singlet_z_margin', 'time', 'tissue_type', 'tot_reads', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo', 'chembl-ID'
    var: 'ensembl_id', 'ncounts', 'ncells'

In [5]:
# Create the CellLineMetaData object; the lookup object is obtained from it in the next cell.
pt_metadata = pt.tl.CellLineMetaData()

In [6]:
# Before annotating cell line metadata, we can create a LookUp object specific to
# CellLineMetaData by calling lookup().
# During initialisation, the object generates a namedtuple for each type of metadata.
# Each namedtuple summarizes the metadata available for the specified source and the
# reference ids that can be used to match against that metadata.
cl_lookup = pt_metadata.lookup()

In [7]:
# Summary of the cell line annotation metadata, with one field per annotation source
cl_lookup.cell_lines

cell_lines(depmap=cell_line_annotation(n_cell_line=1840, cell_line=array(['ACH-000016', 'ACH-000032', 'ACH-000033', ..., 'ACH-002395',
       'ACH-002396', 'ACH-002397'], dtype=object), n_metadata=29, metadata=array(['DepMap_ID', 'cell_line_name', 'stripped_cell_line_name',
       'CCLE_Name', 'alias', 'COSMICID', 'sex', 'source', 'RRID',
       'WTSI_Master_Cell_ID', 'sample_collection_site',
       'primary_or_metastasis', 'primary_disease', 'Subtype', 'age',
       'Sanger_Model_ID', 'depmap_public_comments', 'lineage',
       'lineage_subtype', 'lineage_sub_subtype',
       'lineage_molecular_subtype', 'default_growth_pattern',
       'model_manipulation', 'model_manipulation_details', 'patient_id',
       'parent_depmap_id', 'Cellosaurus_NCIt_disease',
       'Cellosaurus_NCIt_id', 'Cellosaurus_issues'], dtype=object), reference_id=['DepMap_ID', 'cell_line_name', 'stripped_cell_line_name', 'CCLE_Name'], reference_id_example='DepMap_ID: ACH-000016 | cell_line_name: SLR 21 | strippe

In [8]:
# Cell line annotation is available from two different sources, so cl_lookup.cell_lines
# contains both a depmap and a cancerrxgene field; here we select the DepMap summary.
cl_lookup.cell_lines.depmap

cell_line_annotation(n_cell_line=1840, cell_line=array(['ACH-000016', 'ACH-000032', 'ACH-000033', ..., 'ACH-002395',
       'ACH-002396', 'ACH-002397'], dtype=object), n_metadata=29, metadata=array(['DepMap_ID', 'cell_line_name', 'stripped_cell_line_name',
       'CCLE_Name', 'alias', 'COSMICID', 'sex', 'source', 'RRID',
       'WTSI_Master_Cell_ID', 'sample_collection_site',
       'primary_or_metastasis', 'primary_disease', 'Subtype', 'age',
       'Sanger_Model_ID', 'depmap_public_comments', 'lineage',
       'lineage_subtype', 'lineage_sub_subtype',
       'lineage_molecular_subtype', 'default_growth_pattern',
       'model_manipulation', 'model_manipulation_details', 'patient_id',
       'parent_depmap_id', 'Cellosaurus_NCIt_disease',
       'Cellosaurus_NCIt_id', 'Cellosaurus_issues'], dtype=object), reference_id=['DepMap_ID', 'cell_line_name', 'stripped_cell_line_name', 'CCLE_Name'], reference_id_example='DepMap_ID: ACH-000016 | cell_line_name: SLR 21 | stripped_cell_line_name: 

In [9]:
# Individual fields of a summary are accessed with the dot accessor,
# e.g. the number of cell lines in the DepMap annotation.
cl_lookup.cell_lines.depmap.n_cell_line

1840

In [10]:
# Number of metadata columns available in the DepMap annotation
cl_lookup.cell_lines.depmap.n_metadata

29

In [11]:
# Number of cell lines in the Cancerrxgene annotation
cl_lookup.cell_lines.cancerrxgene.n_cell_line

978

In [12]:
# Identifier columns of the Cancerrxgene annotation that can be used as reference_id
cl_lookup.cell_lines.cancerrxgene.reference_id

['cell_line_name', 'stripped_cell_line_name', 'Model ID', 'COSMIC ID']

In [13]:
# One example value per reference id, showing the identifier format of each column
cl_lookup.cell_lines.cancerrxgene.reference_id_example

'cell_line_name: SNU-283 | stripped_cell_line_name: SNU283 | Model ID: SIDM00215 | COSMIC ID: 1659929'

In [14]:
# Default parameters recorded for the DepMap cell line annotation
# (cell_line_source, query_id, reference_id, cell_line_information)
cl_lookup.cell_lines.depmap.default_parameter

{'cell_line_source': 'DepMap',
 'query_id': 'DepMap_ID',
 'reference_id': 'DepMap_ID',
 'cell_line_information': 'None'}

In [15]:
# The lookup functions follow the naming scheme available_{metadata}.
# You can pass a list of unique query ids to test how many cell lines are matched in the metadata.
# The result may differ depending on the identifier and the cell line annotation source you choose.
cl_lookup.available_cell_lines(
    query_id_list=adata_dialogue.obs['cell_line_name'].unique(),
    reference_id="cell_line_name",  # the default reference_id is DepMap_ID
)

In [16]:
# Check the unique DepMap_ID values of the McFarland dataset against the cell line
# annotation; no source or reference_id is passed, so the defaults apply.
cl_lookup.available_cell_lines(query_id_list=adata_mc.obs['DepMap_ID'].unique())

In [17]:
# DepMap_ID is not available as a reference id for Cancerrxgene, so
# stripped_cell_line_name is used as the default reference_id for this source.
cl_lookup.available_cell_lines(
    query_id_list=adata_mc.obs['DepMap_ID'].unique(),
    cell_line_source="Cancerrxgene",
)

In [18]:
# We can do the same for the bulk RNA expression annotation, which is available
# from two sources: Broad and Sanger.
cl_lookup.bulk_rna

bulk_rna_expression(broad=bulk_rna_annotation(n_cell_line=1406, cell_line=array(['ACH-001113', 'ACH-001289', 'ACH-001339', ..., 'ACH-001858',
       'ACH-001997', 'ACH-000052'], dtype=object), n_gene=53970, gene=array(['TSPAN6 (ENSG00000000003)', 'TNMD (ENSG00000000005)',
       'DPM1 (ENSG00000000419)', ..., 'ENSG00000288723',
       'ENSG00000288724', 'ENSG00000288725'], dtype=object), reference_id='DepMap_ID', reference_id_example='DepMap_ID: ACH-001113', default_parameter={'query_id': 'DepMap_ID', 'cell_line_source': 'broad'}), sanger=bulk_rna_annotation(n_cell_line=1431, cell_line=array(['MEC-1', 'NBsusSR', 'M14', ..., 'HCM-SANG-1095-C25',
       'HCM-SANG-1336-C15', 'HCM-SANG-1308-C25'], dtype=object), n_gene=37602, gene=array(['A1BG', 'A1BG-AS1', 'A1CF', ..., 'RSKR', 'SHLD3', 'ATP6V1FNB'],
      dtype=object), reference_id='model_name', reference_id_example='model_name: MEC-1', default_parameter={'query_id': 'cell_line_name', 'cell_line_source': 'sanger'}))

In [19]:
# Number of cell lines in the Broad bulk RNA expression data
cl_lookup.bulk_rna.broad.n_cell_line

1406

In [20]:
# Number of cell lines in the Sanger bulk RNA expression data
cl_lookup.bulk_rna.sanger.n_cell_line

1431

In [21]:
# Check the query cell line names against the Broad bulk RNA expression data.
# NOTE(review): the Broad summary above lists reference_id='DepMap_ID', while the query
# values here are cell line names; confirm this mismatch is the intended demonstration.
cl_lookup.available_bulk_rna_expression(
    cell_line_source="broad",
    query_id_list=adata_dialogue.obs['cell_line_name'].unique(),
)

In [22]:
# Same check against the Sanger bulk RNA expression data, whose reference id
# ('model_name') holds cell line names according to the summary above.
cl_lookup.available_bulk_rna_expression(
    cell_line_source="sanger",
    query_id_list=adata_dialogue.obs['cell_line_name'].unique(),
)

In [23]:
# Repeat the check for the other dataset.
# If we cannot find matched cell lines in the metadata, we can try to annotate cell line
# metadata through annotate_cell_line to see whether we can get more possible cell line identifiers.
# This notebook only focuses on the lookup function, so we don't call annotate_cell_line here.
cl_lookup.available_bulk_rna_expression(
    cell_line_source="broad",
    query_id_list=adata_mc.obs['DepMap_ID'].unique(),
)

In [24]:
# Summary of the proteomics metadata (number of cell lines, cell line names, ...)
cl_lookup.proteomics

proteomics(n_cell_line=948, cell_line=array(['SK-GT-4', 'JM1', 'GR-ST', 'HeLa', 'CML-T1', 'SW954', 'LB771-HNC',
       'huH-1', 'NCCIT', 'Raji', 'KMS-12-BM', 'STS-0421', 'MZ2-MEL',
       'NCI-H1435', 'NCI-H1573', 'RF-48', 'U-266', 'MOLT-16', 'DND-41',
       'FTC-133', 'PCI-4B', 'HSC-39', 'HLE', 'SK-MEL-2', 'SF539',
       'NCI-H2722', 'NCI-H2591', 'UACC-62', 'UACC-257', 'NCI-H522',
       '786-0', 'JHU-011', 'HOP-92', 'HT-29', 'NCI-H3122', 'NCI-H226',
       'COLO-792', 'COLO-684', 'NCI-H460', 'MCF7', 'CP50-MEL-B',
       'LB373-MEL-D', 'LB1047-RCC', 'CP66-MEL', 'EW-16', 'EW-12',
       'D-542MG', 'ALL-PO', 'CAS-1', 'SNU-407', 'OC-314', 'GI-ME-N',
       'OCUB-M', 'NOS-1', 'NEC8', 'NB69', 'MS-1', 'NB5', 'ES8',
       'TGBC24TKB', 'MDA-MB-453', 'EoL-1-cell', 'CTB-1', 'HMV-II',
       'LU-165', 'JHOS-4', 'KU812', 'L-363', 'VMRC-RCZ', 'VMRC-LCD',
       'MFE-296', 'MFE-280', 'MEL-JUSO', 'ME-1', 'LAMA-84', 'SAT', 'SAS',
       'QGP-1', 'SKN-3', 'SKG-IIIa', 'SCH', 'SK-MEL-30', 'SK-GT-2',


In [25]:
# Check the query cell line names against the proteomics metadata,
# using the default reference id.
cl_lookup.available_protein_expression(
    query_id_list=adata_dialogue.obs['cell_line_name'].unique(),
)

In [26]:
# Check the DepMap_ID values of the McFarland dataset against the 'model_id' column
# of the proteomics metadata.
# NOTE(review): DepMap_ID values look like 'ACH-...' in the cell line output above;
# verify that they are expected to match the proteomics 'model_id' values.
cl_lookup.available_protein_expression(
    query_id_list=adata_mc.obs['DepMap_ID'].unique(),
    reference_id="model_id",
)

In [27]:
# Summary of the drug response metadata for the gdsc1 dataset
cl_lookup.drug_response.gdsc1

drug_response_annotation(n_cell_line=970, cell_line=array(['22RV1', '23132-87', '42-MG-BA', '451Lu', '5637', '639-V', '647-V',
       '697', '769-P', '786-0', '8-MG-BA', '8305C', '8505C', 'A101D',
       'A172', 'A204', 'A2058', 'A253', 'A2780', 'A3-KAW', 'A375', 'A388',
       'A4-Fuk', 'A427', 'A431', 'A498', 'A549', 'A673', 'A704', 'ABC-1',
       'ACHN', 'AGS', 'ALL-PO', 'ALL-SIL', 'AM-38', 'AMO-1', 'AN3-CA',
       'ARH-77', 'ASH-3', 'ATN-1', 'AU565', 'AsPC-1', 'B-CPAP', 'BALL-1',
       'BB30-HNC', 'BB49-HNC', 'BB65-RCC', 'BC-1', 'BC-3', 'BE-13',
       'BE2-M17', 'BEN', 'BFTC-905', 'BFTC-909', 'BHT-101', 'BHY',
       'BICR10', 'BICR22', 'BICR31', 'BICR78', 'BL-41', 'BPH-1', 'BT-20',
       'BT-474', 'BT-483', 'BT-549', 'BV-173', 'Becker', 'BxPC-3',
       'C-33-A', 'C-4-I', 'C2BBe1', 'C32', 'C3A', 'CA46', 'CADO-ES1',
       'CAKI-1', 'CAL-120', 'CAL-12T', 'CAL-148', 'CAL-27', 'CAL-29',
       'CAL-33', 'CAL-39', 'CAL-51', 'CAL-54', 'CAL-62', 'CAL-72',
       'CAL-78', 'CAL-85-1

In [28]:
# Summary of the drug response metadata for the gdsc2 dataset
cl_lookup.drug_response.gdsc2

drug_response_annotation(n_cell_line=969, cell_line=array(['22RV1', '23132-87', '42-MG-BA', '451Lu', '5637', '639-V', '647-V',
       '697', '769-P', '786-0', '8-MG-BA', '8305C', '8505C', 'A101D',
       'A172', 'A204', 'A2058', 'A253', 'A2780', 'A3-KAW', 'A375', 'A388',
       'A4-Fuk', 'A427', 'A431', 'A498', 'A549', 'A673', 'A704', 'ABC-1',
       'ACHN', 'AGS', 'ALL-PO', 'ALL-SIL', 'AM-38', 'AMO-1', 'AN3-CA',
       'ARH-77', 'ASH-3', 'ATN-1', 'AU565', 'AsPC-1', 'B-CPAP', 'BALL-1',
       'BB30-HNC', 'BB49-HNC', 'BB65-RCC', 'BC-1', 'BC-3', 'BE-13',
       'BE2-M17', 'BEN', 'BFTC-905', 'BFTC-909', 'BHT-101', 'BHY',
       'BICR10', 'BICR22', 'BICR31', 'BICR78', 'BL-41', 'BONNA-12',
       'BPH-1', 'BT-20', 'BT-474', 'BT-483', 'BT-549', 'BV-173', 'Becker',
       'BxPC-3', 'C-33-A', 'C-4-I', 'C2BBe1', 'C32', 'C3A', 'CA46',
       'CADO-ES1', 'CAKI-1', 'CAL-120', 'CAL-12T', 'CAL-148', 'CAL-27',
       'CAL-29', 'CAL-33', 'CAL-39', 'CAL-51', 'CAL-54', 'CAL-62',
       'CAL-72', 'CAL-78

In [29]:
# Check the query cell line names against the drug response metadata;
# only query_id_list is passed, so all other parameters keep their defaults.
cl_lookup.available_drug_response(query_id_list=adata_dialogue.obs['cell_line_name'].unique())