Merge all fetched datasets originating from Monarch Initiative, TTD and DrugCentral.

Adapted by Rosa Zwart from Jupyter Notebooks found at https://github.com/PPerdomoQ/rare-disease-explainer

In [1]:
import pandas as pd
import numpy as np

import util.constants as constants

#### **Format Monarch data**

In [2]:
monarch_edges = pd.read_csv('prev/monarch/monarch_connections_v2023-02-20.csv')
monarch_edges.drop_duplicates(inplace=True)
monarch_edges

Unnamed: 0,subject_id,subject_label,relation_id,relation_label,object_id,object_label,reference_id_list
0,MGI:1202300,Kcnu1,RO:0002200,has phenotype,MP:0002675,asthenozoospermia,PMID:21427226|PMID:20138882
1,WormBase:WBGene00003369,mlc-1,RO:0002434,interacts with,WormBase:WBGene00006585,tni-3,
2,ENSEMBL:ENSBTAG00000010660,CACNA1C,RO:HOM0000017,in orthology relationship with,SGD:S000003449,CCH1,
3,MGI:98783,Tnni3,RO:HOM0000020,in 1 to 1 orthology relationship with,ENSEMBL:ENSOANG00000039013,ENSEMBL:ENSOANG00000039013,
4,FlyBase:FBgn0038129,TBC1D5,RO:0002200,has phenotype,FBcv:0000349,viable,PMID:20371351|PMID:20220848|PMID:21074052|FlyB...
...,...,...,...,...,...,...,...
82669,ENSEMBL:ENSCAFG00000031099,STUB1,RO:HOM0000020,in 1 to 1 orthology relationship with,ENSEMBL:ENSSSCG00000008008,ENSEMBL:ENSSSCG00000008008,
82670,HGNC:6487,LAMB2,RO:0002434,interacts with,HGNC:8590,PAK1,
82671,ENSEMBL:ENSCAFG00000018952,ARHGEF6,RO:HOM0000020,in 1 to 1 orthology relationship with,ENSEMBL:ENSOANG00000011671,ENSEMBL:ENSOANG00000011671,
82672,HGNC:329,AGRN,RO:0002434,interacts with,HGNC:6487,LAMB2,


In [3]:
monarch_nodes = pd.read_csv('prev/monarch/monarch_nodes_v2023-02-20.csv')
monarch_nodes.drop_duplicates(inplace=True)
monarch_nodes

Unnamed: 0,id,semantic_groups,preflabel,synonyms,description,name
0,MGI:1202300,ORTH,Kcnu1,,,Kcnu1
1,MP:0002675,DISO,asthenozoospermia,,,asthenozoospermia
2,WormBase:WBGene00003369,ORTH,mlc-1,,,mlc-1
3,WormBase:WBGene00006585,ORTH,tni-3,,,tni-3
4,ENSEMBL:ENSBTAG00000010660,ORTH,CACNA1C,CACH2|CACN2|CACNL1A1|CCHL1A1|CaV1.2|LQT8|NEDHL...,This gene encodes an alpha-1 subunit of a volt...,calcium voltage-gated channel subunit alpha1 C
...,...,...,...,...,...,...
9836,ZP:0011545,DISO,"cell posterior lateral mesoderm shape, abnormal",,,"cell posterior lateral mesoderm shape, abnormal"
9837,ZP:0002302,DISO,"trunk undulate, abnormal",,,"trunk undulate, abnormal"
9838,ZP:0005847,DISO,photoreceptor outer segment photoreceptor cell...,,,photoreceptor outer segment photoreceptor cell...
9839,ZP:0006358,DISO,"swimming decreased speed, abnormal",,,"swimming decreased speed, abnormal"


In [4]:
for i, row in monarch_edges.iterrows():
    monarch_edges.loc[i,'id'] = f'MONARCH{i}'
    
    subject_id = row['subject_id']

    monarch_edges.loc[i,'subject_iri'] = np.nan
    monarch_edges.loc[i,'subject_category'] = monarch_nodes.loc[monarch_nodes['id'] == subject_id]['semantic_groups'].values[0]
    monarch_edges.loc[i,'subject_taxon_id'] = np.nan
    monarch_edges.loc[i,'subject_taxon_label'] = np.nan
    
    object_id = row['object_id']
    
    monarch_edges.loc[i,'object_iri'] = np.nan
    monarch_edges.loc[i,'object_category'] = monarch_nodes.loc[monarch_nodes['id'] == object_id]['semantic_groups'].values[0]
    monarch_edges.loc[i,'object_taxon_id'] = np.nan
    monarch_edges.loc[i,'object_taxon_label'] = np.nan
    
    monarch_edges.loc[i,'relation_iri'] = np.nan

In [5]:
monarch_edges

Unnamed: 0,subject_id,subject_label,relation_id,relation_label,object_id,object_label,reference_id_list,id,subject_iri,subject_category,subject_taxon_id,subject_taxon_label,object_iri,object_category,object_taxon_id,object_taxon_label,relation_iri
0,MGI:1202300,Kcnu1,RO:0002200,has phenotype,MP:0002675,asthenozoospermia,PMID:21427226|PMID:20138882,MONARCH0,,ORTH,,,,DISO,,,
1,WormBase:WBGene00003369,mlc-1,RO:0002434,interacts with,WormBase:WBGene00006585,tni-3,,MONARCH1,,ORTH,,,,ORTH,,,
2,ENSEMBL:ENSBTAG00000010660,CACNA1C,RO:HOM0000017,in orthology relationship with,SGD:S000003449,CCH1,,MONARCH2,,ORTH,,,,ORTH,,,
3,MGI:98783,Tnni3,RO:HOM0000020,in 1 to 1 orthology relationship with,ENSEMBL:ENSOANG00000039013,ENSEMBL:ENSOANG00000039013,,MONARCH3,,ORTH,,,,ORTH,,,
4,FlyBase:FBgn0038129,TBC1D5,RO:0002200,has phenotype,FBcv:0000349,viable,PMID:20371351|PMID:20220848|PMID:21074052|FlyB...,MONARCH4,,ORTH,,,,DISO,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82669,ENSEMBL:ENSCAFG00000031099,STUB1,RO:HOM0000020,in 1 to 1 orthology relationship with,ENSEMBL:ENSSSCG00000008008,ENSEMBL:ENSSSCG00000008008,,MONARCH82669,,ORTH,,,,ORTH,,,
82670,HGNC:6487,LAMB2,RO:0002434,interacts with,HGNC:8590,PAK1,,MONARCH82670,,GENE,,,,GENE,,,
82671,ENSEMBL:ENSCAFG00000018952,ARHGEF6,RO:HOM0000020,in 1 to 1 orthology relationship with,ENSEMBL:ENSOANG00000011671,ENSEMBL:ENSOANG00000011671,,MONARCH82671,,ORTH,,,,ORTH,,,
82672,HGNC:329,AGRN,RO:0002434,interacts with,HGNC:6487,LAMB2,,MONARCH82672,,GENE,,,,GENE,,,


In [6]:
monarch_associations_df = monarch_edges[list(constants.assoc_tuple_values)]
monarch_associations_df.to_csv(f'{constants.OUTPUT_FOLDER}/prev_monarch_associations.csv', index=None)

#### **Format TTD data**

Save a list of all associations between DRUG and GENE in a csv file.

In [7]:
drug_targets = pd.read_csv('prev/drug.target.final.ELA.csv')
drug_targets

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,...,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM,NEW_ID
0,(S)-nicardipine,5,Voltage-gated L-type calcium channel,Ion channel,Q01668|Q13936,CACNA1D|CACNA1C,CAC1D_HUMAN|CAC1C_HUMAN,,,,...,DRUG LABEL,,1.0,DRUG LABEL,http://www.accessdata.fda.gov/drugsatfda_docs/...,http://www.accessdata.fda.gov/drugsatfda_docs/...,BLOCKER,Tclin|Tclin|Tclin|Tclin,Homo sapiens,HGNC:1391
1,(S)-nitrendipine,6,Voltage-dependent L-type calcium channel subun...,Ion channel,Q01668,CACNA1D,CAC1D_HUMAN,8.40,,IC50,...,IUPHAR,=,1.0,IUPHAR,,https://www.ebi.ac.uk/chembl/compound/inspect/...,GATING INHIBITOR,Tclin,Homo sapiens,HGNC:1391
2,(S)-nitrendipine,6,Voltage-dependent L-type calcium channel subun...,Ion channel,O60840,CACNA1F,CAC1F_HUMAN,6.00,,IC50,...,IUPHAR,~,,,,,GATING INHIBITOR,Tclin,Homo sapiens,HGNC:1393
3,(S)-nitrendipine,6,Voltage-dependent L-type calcium channel subun...,Ion channel,P22002,Cacna1c,CAC1C_RAT,6.00,,IC50,...,IUPHAR,=,,,,,GATING INHIBITOR,,Rattus norvegicus,RGD:2245
4,(S)-nitrendipine,6,Voltage-dependent L-type calcium channel subun...,Ion channel,Q02485,Cacna1s,CAC1S_RAT,6.00,,IC50,...,IUPHAR,=,,,,,GATING INHIBITOR,,Rattus norvegicus,RGD:70983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,tirbanibulin,5431,Proto-oncogene tyrosine-protein kinase Src,Kinase,P12931,SRC,SRC_HUMAN,4.34,,IC50,...,IUPHAR,=,,,,,INHIBITOR,Tclin,Homo sapiens,HGNC:11283
247,casimersen,5443,exon 53 of dystrophin pre-mRNA,RNA,P11532,DMD,DMD_HUMAN,,,,...,UNKNOWN,,1.0,DRUG LABEL,,https://www.accessdata.fda.gov/drugsatfda_docs...,ANTISENSE INHIBITOR,Tclin,Homo sapiens,HGNC:2928
248,infigratinib,5459,Proto-oncogene tyrosine-protein kinase Src,Kinase,P12931,SRC,SRC_HUMAN,5.53,,IC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:11283
249,infigratinib,5459,Proto-oncogene tyrosine-protein kinase Src,Kinase,P12931,SRC,SRC_HUMAN,5.53,,IC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:11283


Get columns that are relevant.

In [8]:
drug_targets_values = drug_targets.copy()[['STRUCT_ID', 'DRUG_NAME', 'NEW_ID']]
drug_targets_values

Unnamed: 0,STRUCT_ID,DRUG_NAME,NEW_ID
0,5,(S)-nicardipine,HGNC:1391
1,6,(S)-nitrendipine,HGNC:1391
2,6,(S)-nitrendipine,HGNC:1393
3,6,(S)-nitrendipine,RGD:2245
4,6,(S)-nitrendipine,RGD:70983
...,...,...,...
246,5431,tirbanibulin,HGNC:11283
247,5443,casimersen,HGNC:2928
248,5459,infigratinib,HGNC:11283
249,5459,infigratinib,HGNC:11283


In [9]:
drug_targets_values.drop_duplicates(inplace=True)
drug_targets_values.shape[0]

239

Add new columns that are needed to be included in the dataframe.

In [10]:
for i, row in drug_targets_values.iterrows():
    drug_targets_values.loc[i,'id'] = f'TTD{i}'
    
    drug_targets_values.loc[i,'subject_id'] = str(row['STRUCT_ID'])
    drug_targets_values.loc[i,'subject_label'] = row['DRUG_NAME']
    drug_targets_values.loc[i,'subject_iri'] = np.nan
    drug_targets_values.loc[i,'subject_category'] = 'DRUG'
    drug_targets_values.loc[i,'subject_taxon_id'] = np.nan
    drug_targets_values.loc[i,'subject_taxon_label'] = np.nan
    
    drug_targets_values.loc[i,'object_id'] = row['NEW_ID']
    drug_targets_values.loc[i,'object_label'] = np.nan
    drug_targets_values.loc[i,'object_iri'] = np.nan
    drug_targets_values.loc[i,'object_category'] = np.nan
    drug_targets_values.loc[i,'object_taxon_id'] = np.nan
    drug_targets_values.loc[i,'object_taxon_label'] = np.nan
    
    drug_targets_values.loc[i,'relation_id'] = 'CustomRO:TTD'
    drug_targets_values.loc[i,'relation_label'] = 'targets'
    drug_targets_values.loc[i,'relation_iri'] = np.nan

In [11]:
drug_targets_values

Unnamed: 0,STRUCT_ID,DRUG_NAME,NEW_ID,id,subject_id,subject_label,subject_iri,subject_category,subject_taxon_id,subject_taxon_label,object_id,object_label,object_iri,object_category,object_taxon_id,object_taxon_label,relation_id,relation_label,relation_iri
0,5,(S)-nicardipine,HGNC:1391,TTD0,5,(S)-nicardipine,,DRUG,,,HGNC:1391,,,,,,CustomRO:TTD,targets,
1,6,(S)-nitrendipine,HGNC:1391,TTD1,6,(S)-nitrendipine,,DRUG,,,HGNC:1391,,,,,,CustomRO:TTD,targets,
2,6,(S)-nitrendipine,HGNC:1393,TTD2,6,(S)-nitrendipine,,DRUG,,,HGNC:1393,,,,,,CustomRO:TTD,targets,
3,6,(S)-nitrendipine,RGD:2245,TTD3,6,(S)-nitrendipine,,DRUG,,,RGD:2245,,,,,,CustomRO:TTD,targets,
4,6,(S)-nitrendipine,RGD:70983,TTD4,6,(S)-nitrendipine,,DRUG,,,RGD:70983,,,,,,CustomRO:TTD,targets,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,5406,oliceridine,HGNC:1390,TTD242,5406,oliceridine,,DRUG,,,HGNC:1390,,,,,,CustomRO:TTD,targets,
244,5408,viltolarsen,HGNC:2928,TTD244,5408,viltolarsen,,DRUG,,,HGNC:2928,,,,,,CustomRO:TTD,targets,
245,5431,tirbanibulin,HGNC:11283,TTD245,5431,tirbanibulin,,DRUG,,,HGNC:11283,,,,,,CustomRO:TTD,targets,
247,5443,casimersen,HGNC:2928,TTD247,5443,casimersen,,DRUG,,,HGNC:2928,,,,,,CustomRO:TTD,targets,


In [12]:
drugtarget_associations_df = drug_targets_values[list(constants.assoc_tuple_values)]
drugtarget_associations_df.to_csv(f'{constants.OUTPUT_FOLDER}/prev_ttd_associations.csv', index=None)

#### **Format DrugCentral data**

In [13]:
drug_diseases = pd.read_csv('prev/drug_to_disease_final_v2_ELA.csv')
drug_diseases

Unnamed: 0.1,Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,validated,review,ID
0,975,4483,Carfilzomib,Small-cell lung cancer,Phase 1/2,small cell lung cancer,Small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030357,False,False,HP:0030357
1,2389,4225,Axitinib,Renal cell carcinoma,Approved,renal cell carcinoma,Renal cell carcinoma,http://purl.obolibrary.org/obo/HP_0005584,False,False,HP:0005584
2,2871,5431,Tirbanibulin,Prostate cancer,Phase 2,prostate cancer,Prostate cancer,http://purl.obolibrary.org/obo/HP_0012125,False,False,HP:0012125
3,6582,789,Decamethonium,Muscle spasm,Approved,muscle spasm,Hyperkinetic movements,http://purl.obolibrary.org/obo/HP_0002487,False,False,HP:0002487
4,9416,661,Cisatracurium,Muscle spasm,Approved,muscle spasm,Hyperkinetic movements,http://purl.obolibrary.org/obo/HP_0002487,False,False,HP:0002487
5,9732,749,Cyclizine,Nausea,Approved,nausea,Nausea,http://purl.obolibrary.org/obo/HP_0002018,False,False,HP:0002018
6,9800,183,Amlodipine,Hypertension,Approved,hypertension,Hypertension,http://purl.obolibrary.org/obo/HP_0000822,False,False,HP:0000822
7,10669,461,Cadralazine,Hypertension,Approved,hypertension,Hypertension,http://purl.obolibrary.org/obo/HP_0000822,False,False,HP:0000822
8,10829,2745,Trihexyphenidyl,Dystonia,Approved,dystonia,Dystonia,http://purl.obolibrary.org/obo/HP_0001332,False,False,HP:0001332
9,10831,2745,Trihexyphenidyl,Obesity,Investigative,obesity,Obesity,http://purl.obolibrary.org/obo/HP_0001513,False,False,HP:0001513


In [14]:
drug_diseases_values = drug_diseases.copy()[['DRUG_ID', 'ID']]
drug_diseases_values

Unnamed: 0,DRUG_ID,ID
0,4483,HP:0030357
1,4225,HP:0005584
2,5431,HP:0012125
3,789,HP:0002487
4,661,HP:0002487
5,749,HP:0002018
6,183,HP:0000822
7,461,HP:0000822
8,2745,HP:0001332
9,2745,HP:0001513


In [15]:
for i, row in drug_diseases_values.iterrows():
    drug_diseases_values.loc[i,'id'] = f'DC{i}'
    
    drug_diseases_values.loc[i,'subject_id'] = row['DRUG_ID']
    drug_diseases_values.loc[i,'subject_label'] = np.nan
    drug_diseases_values.loc[i,'subject_iri'] = np.nan
    drug_diseases_values.loc[i,'subject_category'] = 'DRUG'
    drug_diseases_values.loc[i,'subject_taxon_id'] = np.nan
    drug_diseases_values.loc[i,'subject_taxon_label'] = np.nan
    
    drug_diseases_values.loc[i,'object_id'] = row['ID']
    drug_diseases_values.loc[i,'object_label'] = np.nan
    drug_diseases_values.loc[i,'object_iri'] = np.nan
    drug_diseases_values.loc[i,'object_category'] = np.nan
    drug_diseases_values.loc[i,'object_taxon_id'] = np.nan
    drug_diseases_values.loc[i,'object_taxon_label'] = np.nan
    
    drug_diseases_values.loc[i,'relation_id'] = 'CustomRO:DC'
    drug_diseases_values.loc[i,'relation_label'] = 'is substance that treats'
    drug_diseases_values.loc[i,'relation_iri'] = np.nan

In [16]:
drug_diseases_values

Unnamed: 0,DRUG_ID,ID,id,subject_id,subject_label,subject_iri,subject_category,subject_taxon_id,subject_taxon_label,object_id,object_label,object_iri,object_category,object_taxon_id,object_taxon_label,relation_id,relation_label,relation_iri
0,4483,HP:0030357,DC0,4483.0,,,DRUG,,,HP:0030357,,,,,,CustomRO:DC,is substance that treats,
1,4225,HP:0005584,DC1,4225.0,,,DRUG,,,HP:0005584,,,,,,CustomRO:DC,is substance that treats,
2,5431,HP:0012125,DC2,5431.0,,,DRUG,,,HP:0012125,,,,,,CustomRO:DC,is substance that treats,
3,789,HP:0002487,DC3,789.0,,,DRUG,,,HP:0002487,,,,,,CustomRO:DC,is substance that treats,
4,661,HP:0002487,DC4,661.0,,,DRUG,,,HP:0002487,,,,,,CustomRO:DC,is substance that treats,
5,749,HP:0002018,DC5,749.0,,,DRUG,,,HP:0002018,,,,,,CustomRO:DC,is substance that treats,
6,183,HP:0000822,DC6,183.0,,,DRUG,,,HP:0000822,,,,,,CustomRO:DC,is substance that treats,
7,461,HP:0000822,DC7,461.0,,,DRUG,,,HP:0000822,,,,,,CustomRO:DC,is substance that treats,
8,2745,HP:0001332,DC8,2745.0,,,DRUG,,,HP:0001332,,,,,,CustomRO:DC,is substance that treats,
9,2745,HP:0001513,DC9,2745.0,,,DRUG,,,HP:0001513,,,,,,CustomRO:DC,is substance that treats,


In [17]:
drugdiseases_associations_df = drug_diseases_values[list(constants.assoc_tuple_values)]
drugdiseases_associations_df.to_csv(f'{constants.OUTPUT_FOLDER}/prev_drugcentral_associations.csv', index=None)