In [34]:
import pertpy as pt
import scanpy as sc

In [35]:
# Load the example dataset exposed as `pt.dt.dialogue_example()` and display its summary.
adata = pt.dt.dialogue_example() 
adata

AnnData object with n_obs × n_vars = 5374 × 6329
    obs: 'nCount_RNA', 'nFeature_RNA', 'cellQ', 'gender', 'location', 'clinical.status', 'cell.subtypes', 'pathology', 'origin', 'subset'
    var: 'name'

In [36]:
# Label every observation with the cell line name 'MCF7'.
# The column is deliberately called 'cell_line_name' rather than 'DepMap_ID': storing a
# name under 'DepMap_ID' is confusing once `annotate_cell_lines` is called (see below).
adata.obs['cell_line_name'] = 'MCF7'

In [37]:
# Create the cell line metadata helper used by the annotation cells that follow.
pt_metadata = pt.tl.CellLineMetaData()

In [38]:
# Annotate cell line metadata, using 'cell_line_name' as both the query and the reference identifier.
pt_metadata.annotate_cell_lines(adata=adata, reference_id='cell_line_name', query_id='cell_line_name')
# After annotation, adata.obs contains a 'DepMap_ID' column (see the output below).
# If we had instead stored the name as `adata.obs['DepMap_ID'] = 'MCF7'`:
#   Case 1: with the default reference_id (DepMap_ID), no matching cell line IDs are found,
#           because the stored value is actually a name, not a DepMap ID.
#   Case 2: with reference_id='cell_line_name', the function does not overwrite columns that
#           already exist in adata.obs, so the pre-existing 'DepMap_ID' column is kept and
#           the true DepMap ID is lost.

AnnData object with n_obs × n_vars = 5374 × 6329
    obs: 'nCount_RNA', 'nFeature_RNA', 'cellQ', 'gender', 'location', 'clinical.status', 'cell.subtypes', 'pathology', 'origin', 'subset', 'cell_line_name', 'DepMap_ID', 'stripped_cell_line_name', 'CCLE_Name', 'alias', 'COSMICID', 'sex', 'source', 'RRID', 'WTSI_Master_Cell_ID', 'sample_collection_site', 'primary_or_metastasis', 'primary_disease', 'Subtype', 'age', 'Sanger_Model_ID', 'depmap_public_comments', 'lineage', 'lineage_subtype', 'lineage_sub_subtype', 'lineage_molecular_subtype', 'default_growth_pattern', 'model_manipulation', 'model_manipulation_details', 'patient_id', 'parent_depmap_id', 'Cellosaurus_NCIt_disease', 'Cellosaurus_NCIt_id', 'Cellosaurus_issues'
    var: 'name'

In [39]:
# Inspect one of the metadata columns that the annotation added to adata.obs.
adata.obs[['primary_or_metastasis']]

Unnamed: 0_level_0,primary_or_metastasis
index,Unnamed: 1_level_1
N7.EpiA.AAACGCACAATCGC,Metastasis
N7.EpiA.AGATATTGATCGGT,Metastasis
N7.EpiA.AGTCTACTTCTCTA,Metastasis
N7.EpiA.ATATACGAAGTACC,Metastasis
N7.EpiA.ATCTGTTGTCATTC,Metastasis
...,...
N110.LPB.TCTTCGGTCACGCATA,Metastasis
N110.LPB.TTAGGCACAATCCAAC,Metastasis
N110.LPB.TTGAACGTCGTACGGC,Metastasis
N110.LPB.TTGGCAATCCTCCTAG,Metastasis


In [40]:
# Annotate again by cell line name, this time with cell_line_source='Cancerrxgene'.
# Per the output below, this appends further obs columns (e.g. 'Model ID', 'GDSC1', 'GDSC2').
pt_metadata.annotate_cell_lines(adata=adata, query_id='cell_line_name', reference_id="cell_line_name", cell_line_source='Cancerrxgene')

AnnData object with n_obs × n_vars = 5374 × 6329
    obs: 'nCount_RNA', 'nFeature_RNA', 'cellQ', 'gender', 'location', 'clinical.status', 'cell.subtypes', 'pathology', 'origin', 'subset', 'cell_line_name', 'DepMap_ID', 'stripped_cell_line_name', 'CCLE_Name', 'alias', 'COSMICID', 'sex', 'source', 'RRID', 'WTSI_Master_Cell_ID', 'sample_collection_site', 'primary_or_metastasis', 'primary_disease', 'Subtype', 'age', 'Sanger_Model_ID', 'depmap_public_comments', 'lineage', 'lineage_subtype', 'lineage_sub_subtype', 'lineage_molecular_subtype', 'default_growth_pattern', 'model_manipulation', 'model_manipulation_details', 'patient_id', 'parent_depmap_id', 'Cellosaurus_NCIt_disease', 'Cellosaurus_NCIt_id', 'Cellosaurus_issues', 'TCGA Classfication', 'Model ID', 'Tissue', 'COSMIC ID', 'Tissue sub-type', 'GDSC1', 'GDSC2'
    var: 'name'

In [41]:
# Display the obs table, which now includes the columns added by both annotation calls.
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,cellQ,gender,location,clinical.status,cell.subtypes,pathology,origin,subset,...,Cellosaurus_NCIt_disease,Cellosaurus_NCIt_id,Cellosaurus_issues,TCGA Classfication,Model ID,Tissue,COSMIC ID,Tissue sub-type,GDSC1,GDSC2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N7.EpiA.AAACGCACAATCGC,2176.911552,269,0.037467,Female,Epi,Non-inflamed,TA2,True,A,A,...,Invasive breast carcinoma of no special type,C4194,,BRCA,SIDM00148,breast,905946,breast,345.0,285.0
N7.EpiA.AGATATTGATCGGT,4319.159178,660,0.093293,Female,Epi,Non-inflamed,TA2,True,A,A,...,Invasive breast carcinoma of no special type,C4194,,BRCA,SIDM00148,breast,905946,breast,345.0,285.0
N7.EpiA.AGTCTACTTCTCTA,7230.356204,1543,0.242912,Female,Epi,Non-inflamed,TA2,True,A,A,...,Invasive breast carcinoma of no special type,C4194,,BRCA,SIDM00148,breast,905946,breast,345.0,285.0
N7.EpiA.ATATACGAAGTACC,5868.425665,1051,0.155489,Female,Epi,Non-inflamed,TA2,True,A,A,...,Invasive breast carcinoma of no special type,C4194,,BRCA,SIDM00148,breast,905946,breast,345.0,285.0
N7.EpiA.ATCTGTTGTCATTC,3130.076031,432,0.058324,Female,Epi,Non-inflamed,TA2,True,A,A,...,Invasive breast carcinoma of no special type,C4194,,BRCA,SIDM00148,breast,905946,breast,345.0,285.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N110.LPB.TCTTCGGTCACGCATA,3346.150450,492,0.198183,Female,LP,Inflamed,CD8+ IELs,True,,C,...,Invasive breast carcinoma of no special type,C4194,,BRCA,SIDM00148,breast,905946,breast,345.0,285.0
N110.LPB.TTAGGCACAATCCAAC,4341.610208,672,0.268843,Female,LP,Inflamed,CD8+ LP,True,,C,...,Invasive breast carcinoma of no special type,C4194,,BRCA,SIDM00148,breast,905946,breast,345.0,285.0
N110.LPB.TTGAACGTCGTACGGC,4410.712825,706,0.289367,Female,LP,Inflamed,CD8+ LP,True,,C,...,Invasive breast carcinoma of no special type,C4194,,BRCA,SIDM00148,breast,905946,breast,345.0,285.0
N110.LPB.TTGGCAATCCTCCTAG,2755.624716,373,0.149731,Female,LP,Inflamed,CD8+ LP,True,,C,...,Invasive breast carcinoma of no special type,C4194,,BRCA,SIDM00148,breast,905946,breast,345.0,285.0


In [42]:
# CCLE expression annotation only accepts DepMap_ID as the cell line identifier.
# That column is available here because `annotate_cell_lines` added it to adata.obs above.
pt_metadata.annotate_ccle_expression(adata)

AnnData object with n_obs × n_vars = 5374 × 6329
    obs: 'nCount_RNA', 'nFeature_RNA', 'cellQ', 'gender', 'location', 'clinical.status', 'cell.subtypes', 'pathology', 'origin', 'subset', 'cell_line_name', 'DepMap_ID', 'stripped_cell_line_name', 'CCLE_Name', 'alias', 'COSMICID', 'sex', 'source', 'RRID', 'WTSI_Master_Cell_ID', 'sample_collection_site', 'primary_or_metastasis', 'primary_disease', 'Subtype', 'age', 'Sanger_Model_ID', 'depmap_public_comments', 'lineage', 'lineage_subtype', 'lineage_sub_subtype', 'lineage_molecular_subtype', 'default_growth_pattern', 'model_manipulation', 'model_manipulation_details', 'patient_id', 'parent_depmap_id', 'Cellosaurus_NCIt_disease', 'Cellosaurus_NCIt_id', 'Cellosaurus_issues', 'TCGA Classfication', 'Model ID', 'Tissue', 'COSMIC ID', 'Tissue sub-type', 'GDSC1', 'GDSC2'
    var: 'name'
    obsm: 'CCLE_expression'

In [43]:
# Display the CCLE expression table stored in adata.obsm under 'CCLE_expression'.
adata.obsm['CCLE_expression']

Unnamed: 0_level_0,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),NFYA (4800),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N7.EpiA.AAACGCACAATCGC,2.403268,0.0,7.490249,2.606442,3.177918,0.014355,0.056584,6.759688,5.029453,4.324811,...,3.419539,1.731183,0.250962,2.003602,0.356144,0.0,0.0,0.0,0.0,0.0
N7.EpiA.AGATATTGATCGGT,2.403268,0.0,7.490249,2.606442,3.177918,0.014355,0.056584,6.759688,5.029453,4.324811,...,3.419539,1.731183,0.250962,2.003602,0.356144,0.0,0.0,0.0,0.0,0.0
N7.EpiA.AGTCTACTTCTCTA,2.403268,0.0,7.490249,2.606442,3.177918,0.014355,0.056584,6.759688,5.029453,4.324811,...,3.419539,1.731183,0.250962,2.003602,0.356144,0.0,0.0,0.0,0.0,0.0
N7.EpiA.ATATACGAAGTACC,2.403268,0.0,7.490249,2.606442,3.177918,0.014355,0.056584,6.759688,5.029453,4.324811,...,3.419539,1.731183,0.250962,2.003602,0.356144,0.0,0.0,0.0,0.0,0.0
N7.EpiA.ATCTGTTGTCATTC,2.403268,0.0,7.490249,2.606442,3.177918,0.014355,0.056584,6.759688,5.029453,4.324811,...,3.419539,1.731183,0.250962,2.003602,0.356144,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N110.LPB.TCTTCGGTCACGCATA,2.403268,0.0,7.490249,2.606442,3.177918,0.014355,0.056584,6.759688,5.029453,4.324811,...,3.419539,1.731183,0.250962,2.003602,0.356144,0.0,0.0,0.0,0.0,0.0
N110.LPB.TTAGGCACAATCCAAC,2.403268,0.0,7.490249,2.606442,3.177918,0.014355,0.056584,6.759688,5.029453,4.324811,...,3.419539,1.731183,0.250962,2.003602,0.356144,0.0,0.0,0.0,0.0,0.0
N110.LPB.TTGAACGTCGTACGGC,2.403268,0.0,7.490249,2.606442,3.177918,0.014355,0.056584,6.759688,5.029453,4.324811,...,3.419539,1.731183,0.250962,2.003602,0.356144,0.0,0.0,0.0,0.0,0.0
N110.LPB.TTGGCAATCCTCCTAG,2.403268,0.0,7.490249,2.606442,3.177918,0.014355,0.056584,6.759688,5.029453,4.324811,...,3.419539,1.731183,0.250962,2.003602,0.356144,0.0,0.0,0.0,0.0,0.0


In [44]:
# Demonstrate the error raised when no identifier can be matched: querying with
# 'DepMap_ID' against reference_id="model_id" finds nothing in the protein expression data.
# The ValueError is caught and printed (instead of left uncaught) so that the notebook
# still runs top to bottom under "Restart & Run All".
# The next cell retries with the cell line name, which does match.
try:
    pt_metadata.annotate_protein_expression(adata, query_id='DepMap_ID', reference_id="model_id")
except ValueError as err:
    # Same "ExceptionType: message" form as the last line of a traceback.
    print(f"{type(err).__name__}: {err}")

ValueError: ('All the identifiers present in adata.obs could not be found in the protein expression file, ', 'Stop annotating protein expression metadata. Please check it again.')

In [45]:
# Annotate protein expression using the cell line name as the query identifier;
# reference_id is left at its default (model_name).
pt_metadata.annotate_protein_expression(adata, query_id='cell_line_name')
# Display the result stored in adata.obsm under 'proteomics_protein_intensity'.
adata.obsm['proteomics_protein_intensity']

uniprot_id,A0A075B6K4,A0A075B6N1,A0A075B7B8,A0A075B7D8,A0A087X0M5,A0A0B4J1V0,A0A0B4J2F0,A0A0C4DH29,A0A0C4DH31,A0A0U1RRE5,...,Q9Y6V0,Q9Y6V7,Q9Y6W3,Q9Y6W5,Q9Y6X4,Q9Y6X5,Q9Y6X8,Q9Y6X9,Q9Y6Y0,Q9Y6Y8
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N7.EpiA.AAACGCACAATCGC,,,,,,,2.19148,,,,...,3.56099,,5.34285,3.56077,,,3.63192,3.28495,,2.38991
N7.EpiA.AGATATTGATCGGT,,,,,,,2.19148,,,,...,3.56099,,5.34285,3.56077,,,3.63192,3.28495,,2.38991
N7.EpiA.AGTCTACTTCTCTA,,,,,,,2.19148,,,,...,3.56099,,5.34285,3.56077,,,3.63192,3.28495,,2.38991
N7.EpiA.ATATACGAAGTACC,,,,,,,2.19148,,,,...,3.56099,,5.34285,3.56077,,,3.63192,3.28495,,2.38991
N7.EpiA.ATCTGTTGTCATTC,,,,,,,2.19148,,,,...,3.56099,,5.34285,3.56077,,,3.63192,3.28495,,2.38991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N110.LPB.TCTTCGGTCACGCATA,,,,,,,2.19148,,,,...,3.56099,,5.34285,3.56077,,,3.63192,3.28495,,2.38991
N110.LPB.TTAGGCACAATCCAAC,,,,,,,2.19148,,,,...,3.56099,,5.34285,3.56077,,,3.63192,3.28495,,2.38991
N110.LPB.TTGAACGTCGTACGGC,,,,,,,2.19148,,,,...,3.56099,,5.34285,3.56077,,,3.63192,3.28495,,2.38991
N110.LPB.TTGGCAATCCTCCTAG,,,,,,,2.19148,,,,...,3.56099,,5.34285,3.56077,,,3.63192,3.28495,,2.38991


In [46]:
# Annotate bulk RNA expression using the cell line name as the query identifier;
# reference_id is left at its default (model_name).
pt_metadata.annotate_bulk_rna_expression(adata, query_id='cell_line_name')

AnnData object with n_obs × n_vars = 5374 × 6329
    obs: 'nCount_RNA', 'nFeature_RNA', 'cellQ', 'gender', 'location', 'clinical.status', 'cell.subtypes', 'pathology', 'origin', 'subset', 'cell_line_name', 'DepMap_ID', 'stripped_cell_line_name', 'CCLE_Name', 'alias', 'COSMICID', 'sex', 'source', 'RRID', 'WTSI_Master_Cell_ID', 'sample_collection_site', 'primary_or_metastasis', 'primary_disease', 'Subtype', 'age', 'Sanger_Model_ID', 'depmap_public_comments', 'lineage', 'lineage_subtype', 'lineage_sub_subtype', 'lineage_molecular_subtype', 'default_growth_pattern', 'model_manipulation', 'model_manipulation_details', 'patient_id', 'parent_depmap_id', 'Cellosaurus_NCIt_disease', 'Cellosaurus_NCIt_id', 'Cellosaurus_issues', 'TCGA Classfication', 'Model ID', 'Tissue', 'COSMIC ID', 'Tissue sub-type', 'GDSC1', 'GDSC2'
    var: 'name'
    obsm: 'CCLE_expression', 'proteomics_protein_intensity', 'bulk_rna_expression_broad'

In [47]:
# Display the bulk RNA expression table stored in adata.obsm under 'bulk_rna_expression_broad'.
adata.obsm['bulk_rna_expression_broad']

gene_id,SIDG00001,SIDG00002,SIDG00003,SIDG00004,SIDG00005,SIDG00006,SIDG00007,SIDG00008,SIDG00009,SIDG00010,...,SIDG42469,SIDG42470,SIDG42471,SIDG42472,SIDG42473,SIDG42474,SIDG42475,SIDG42479,SIDG42480,SIDG42481
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N7.EpiA.AAACGCACAATCGC,34,280,8,32,45,19,0,30,20,1,...,964,0,1099,2461,0,9228,2832,725,232,39
N7.EpiA.AGATATTGATCGGT,34,280,8,32,45,19,0,30,20,1,...,964,0,1099,2461,0,9228,2832,725,232,39
N7.EpiA.AGTCTACTTCTCTA,34,280,8,32,45,19,0,30,20,1,...,964,0,1099,2461,0,9228,2832,725,232,39
N7.EpiA.ATATACGAAGTACC,34,280,8,32,45,19,0,30,20,1,...,964,0,1099,2461,0,9228,2832,725,232,39
N7.EpiA.ATCTGTTGTCATTC,34,280,8,32,45,19,0,30,20,1,...,964,0,1099,2461,0,9228,2832,725,232,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N110.LPB.TCTTCGGTCACGCATA,34,280,8,32,45,19,0,30,20,1,...,964,0,1099,2461,0,9228,2832,725,232,39
N110.LPB.TTAGGCACAATCCAAC,34,280,8,32,45,19,0,30,20,1,...,964,0,1099,2461,0,9228,2832,725,232,39
N110.LPB.TTGAACGTCGTACGGC,34,280,8,32,45,19,0,30,20,1,...,964,0,1099,2461,0,9228,2832,725,232,39
N110.LPB.TTGGCAATCCTCCTAG,34,280,8,32,45,19,0,30,20,1,...,964,0,1099,2461,0,9228,2832,725,232,39


In [None]:
# TODO: placeholder cell, not executed in the saved notebook.
# Presumably meant to obtain viability data from GDSC — confirm.
# NOTE(review): `annotate_from_GDSC` is not used anywhere else in this notebook;
# verify that the method exists on CellLineMetaData before running.
viability = pt_metadata.annotate_from_GDSC()

In [None]:
# Sketch, not executed in the saved notebook: train a model on adata.X against `viability`.
# NOTE(review): `LogisticRegression` is not imported in the visible import cell, and
# scikit-learn's LogisticRegression exposes `fit`, not `train` — confirm which class is intended.
arc_witch = LogisticRegression()
arc_witch.train(adata.X, viability)

Some checks of the error messages:

In [48]:
# Demonstrate the error raised when no identifier can be matched: querying by 'DepMap_ID'
# with cell_line_source="sanger" finds nothing in the bulk RNA expression data.
# Author's note: MCF7 is not in the metadata (checked manually).
# The ValueError is caught and printed (instead of left uncaught) so that the notebook
# still runs top to bottom under "Restart & Run All".
try:
    pt_metadata.annotate_bulk_rna_expression(adata, query_id='DepMap_ID', cell_line_source="sanger")
except ValueError as err:
    # Same "ExceptionType: message" form as the last line of a traceback.
    print(f"{type(err).__name__}: {err}")

ValueError: ('All the identifiers present in adata.obs could not be found in the bulk RNA expression data, ', 'Stop annotating bulk RNA expression data. Please check it again.')

In [None]:
# Same Sanger-source annotation, but querying by cell line name.
# Not executed in the saved notebook, so its outcome is not recorded here.
pt_metadata.annotate_bulk_rna_expression(adata, query_id='cell_line_name', cell_line_source="sanger")


In [None]:
# Inspect the `driver_gene_intOGen` attribute of the metadata helper (not executed in the saved notebook).
# NOTE(review): presumably the IntOGen driver gene table — confirm.
pt_metadata.driver_gene_intOGen