Merge all fetched datasets originating from Monarch Initiative, TTD and DrugCentral.

Adapted by Rosa Zwart from Jupyter Notebooks found at https://github.com/PPerdomoQ/rare-disease-explainer

In [1]:
import pandas as pd
import numpy as np

import util.constants as constants

#### **Format Monarch data**

In [2]:
monarch_edges = pd.read_csv('prev/monarch/monarch_connections_v2024-01-30.csv')
monarch_edges.drop_duplicates(inplace=True)
monarch_edges

Unnamed: 0,subject_id,subject_label,relation_id,relation_label,object_id,object_label,reference_id_list
0,ENSEMBL:ENSRNOG00000030224,A0A0G2K7M1,RO:HOM0000017,in orthology relationship with,ENSEMBL:ENSECAG00000003944,ENSEMBL:ENSECAG00000003944,
1,SGD:S000003272,KSS1,RO:HOM0000017,in orthology relationship with,ENSEMBL:ENSCAFG00000031581,MAPK3,
2,ENSEMBL:ENSSSCG00000021408,TKT,RO:HOM0000020,in 1 to 1 orthology relationship with,WormBase:WBGene00008506,tkt-1,
3,MGI:1351465,G3bp1,RO:0002200,has phenotype,MP:0011110,"preweaning lethality, incomplete penetrance",
4,RGD:1359718,Wdr45,RO:HOM0000020,in 1 to 1 orthology relationship with,ENSEMBL:ENSOANG00000044470,ENSEMBL:ENSOANG00000044470,
...,...,...,...,...,...,...,...
112494,RGD:1595815,RT1-M6-2,RO:HOM0000017,in orthology relationship with,MGI:95931,H2-Q2,
112495,ZFIN:ZDB-GENE-040801-7,sumo2b,RO:0002434,interacts with,ZFIN:ZDB-GENE-060929-492,rangap1a,
112496,ENSEMBL:ENSSSCG00000035676,G3BP1,RO:HOM0000020,in 1 to 1 orthology relationship with,SGD:S000005334,BRE5,
112497,RGD:1595924,RT1-A1,RO:HOM0000017,in orthology relationship with,ENSEMBL:ENSBTAG00000002069,BOLA,


In [4]:
monarch_nodes = pd.read_csv('prev/monarch/monarch_nodes_v2024-01-30.csv')
monarch_nodes.drop_duplicates(inplace=True)
monarch_nodes

Unnamed: 0,id,semantic_groups,preflabel,synonyms,description,name
0,ENSEMBL:ENSRNOG00000030224,ORTH,A0A0G2K7M1,,,A0A0G2K7M1
1,ENSEMBL:ENSECAG00000003944,ORTH,ENSEMBL:ENSECAG00000003944,,,ENSEMBL:ENSECAG00000003944
2,SGD:S000003272,ORTH,KSS1,,,KSS1
3,ENSEMBL:ENSCAFG00000031581,ORTH,MAPK3,ERK-1|ERK1|ERT2|HS44KDAP|HUMKER1A|P44ERK1|P44M...,The protein encoded by this gene is a member o...,mitogen-activated protein kinase 3
4,ENSEMBL:ENSSSCG00000021408,ORTH,TKT,MIG20a|NTRKR3|TKT|TYRO10|WRCN,This gene encodes a member of the discoidin do...,discoidin domain receptor tyrosine kinase 2
...,...,...,...,...,...,...
7785,MONARCH:APO_0000220APO_0000003,GENO,necrotic cell death:decreased,,,necrotic cell death:decreased
7786,ZP:0011205,DISO,"cerebellum hypoplastic, abnormal",,,"cerebellum hypoplastic, abnormal"
7787,HP:0003097,DISO,Short femur,,,Short femur
7788,MP:0001700,DISO,abnormal embryo turning,,,abnormal embryo turning


In [5]:
for i, row in monarch_edges.iterrows():
    monarch_edges.loc[i,'id'] = f'MONARCH{i}'
    
    subject_id = row['subject_id']

    monarch_edges.loc[i,'subject_iri'] = np.nan
    monarch_edges.loc[i,'subject_category'] = monarch_nodes.loc[monarch_nodes['id'] == subject_id]['semantic_groups'].values[0]
    monarch_edges.loc[i,'subject_taxon_id'] = np.nan
    monarch_edges.loc[i,'subject_taxon_label'] = np.nan
    
    object_id = row['object_id']
    
    monarch_edges.loc[i,'object_iri'] = np.nan
    monarch_edges.loc[i,'object_category'] = monarch_nodes.loc[monarch_nodes['id'] == object_id]['semantic_groups'].values[0]
    monarch_edges.loc[i,'object_taxon_id'] = np.nan
    monarch_edges.loc[i,'object_taxon_label'] = np.nan
    
    monarch_edges.loc[i,'relation_iri'] = np.nan

In [6]:
monarch_edges

Unnamed: 0,subject_id,subject_label,relation_id,relation_label,object_id,object_label,reference_id_list,id,subject_iri,subject_category,subject_taxon_id,subject_taxon_label,object_iri,object_category,object_taxon_id,object_taxon_label,relation_iri
0,ENSEMBL:ENSRNOG00000030224,A0A0G2K7M1,RO:HOM0000017,in orthology relationship with,ENSEMBL:ENSECAG00000003944,ENSEMBL:ENSECAG00000003944,,MONARCH0,,ORTH,,,,ORTH,,,
1,SGD:S000003272,KSS1,RO:HOM0000017,in orthology relationship with,ENSEMBL:ENSCAFG00000031581,MAPK3,,MONARCH1,,ORTH,,,,ORTH,,,
2,ENSEMBL:ENSSSCG00000021408,TKT,RO:HOM0000020,in 1 to 1 orthology relationship with,WormBase:WBGene00008506,tkt-1,,MONARCH2,,ORTH,,,,ORTH,,,
3,MGI:1351465,G3bp1,RO:0002200,has phenotype,MP:0011110,"preweaning lethality, incomplete penetrance",,MONARCH3,,ORTH,,,,DISO,,,
4,RGD:1359718,Wdr45,RO:HOM0000020,in 1 to 1 orthology relationship with,ENSEMBL:ENSOANG00000044470,ENSEMBL:ENSOANG00000044470,,MONARCH4,,ORTH,,,,ORTH,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112494,RGD:1595815,RT1-M6-2,RO:HOM0000017,in orthology relationship with,MGI:95931,H2-Q2,,MONARCH112494,,ORTH,,,,ORTH,,,
112495,ZFIN:ZDB-GENE-040801-7,sumo2b,RO:0002434,interacts with,ZFIN:ZDB-GENE-060929-492,rangap1a,,MONARCH112495,,ORTH,,,,ORTH,,,
112496,ENSEMBL:ENSSSCG00000035676,G3BP1,RO:HOM0000020,in 1 to 1 orthology relationship with,SGD:S000005334,BRE5,,MONARCH112496,,ORTH,,,,ORTH,,,
112497,RGD:1595924,RT1-A1,RO:HOM0000017,in orthology relationship with,ENSEMBL:ENSBTAG00000002069,BOLA,,MONARCH112497,,ORTH,,,,ORTH,,,


In [7]:
monarch_associations_df = monarch_edges[list(constants.assoc_tuple_values)]
monarch_associations_df.to_csv(f'{constants.OUTPUT_FOLDER}/prev_monarch_associations.csv', index=None)

#### **Format TTD data**

Save a list of all associations between DRUG and GENE in a csv file.

In [8]:
drug_targets = pd.read_csv('prev/drug.target.final.ELA.csv')
drug_targets

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,...,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM,NEW_ID
0,adenosine triphosphate,91,Heat shock cognate 71 kDa protein,Unclassified,P11142,HSPA8,HSP7C_HUMAN,4.760,,IC50,...,CHEMBL,=,,,,,,Tchem,Homo sapiens,HGNC:5241
1,adenosine triphosphate,91,Heat shock 70 kDa protein 1A,Unclassified,P0DMV8,HSPA1A,HS71A_HUMAN,5.470,,IC50,...,CHEMBL,=,,,,,,Tchem,Homo sapiens,HGNC:5232
2,amodiaquine,186,Amyloid beta A4 protein,Unclassified,P05067,APP,A4_HUMAN,5.400,,IC50,...,CHEMBL,=,,,,,,,Homo sapiens,HGNC:620
3,anileridine,220,Mu-type opioid receptor,GPCR,P35372,OPRM1,OPRM_HUMAN,,,,...,WOMBAT-PK,,1.0,CHEMBL,,https://www.ebi.ac.uk/chembl/compound/inspect/...,AGONIST,Tclin,Homo sapiens,HGNC:8156
4,aprindine,231,Calmodulin,Cytosolic other,P0DP23|P0DP24|P0DP25,CALM1|CALM2|CALM3,CALM1_HUMAN|CALM2_HUMAN|CALM3_HUMAN,4.745,,ID50,...,WOMBAT-PK,=,1.0,SCIENTIFIC LITERATURE,,https://pubmed.ncbi.nlm.nih.gov/6186851,INHIBITOR,Tclin|Tclin|Tclin,Homo sapiens,HGNC:1442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,selumetinib,5388,Epidermal growth factor receptor,Kinase,P00533,EGFR,EGFR_HUMAN,5.660,,Kd,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:3236
154,tucatinib,5389,Epidermal growth factor receptor,Kinase,P00533,EGFR,EGFR_HUMAN,6.350,,IC50,...,IUPHAR,=,,,,,INHIBITOR,Tclin,Homo sapiens,HGNC:3236
155,oliceridine,5406,Mu-type opioid receptor,GPCR,P35372,OPRM1,OPRM_HUMAN,8.220,,Ki,...,DRUG LABEL,=,1.0,DRUG LABEL,https://www.accessdata.fda.gov/drugsatfda_docs...,https://www.accessdata.fda.gov/drugsatfda_docs...,AGONIST,Tclin,Homo sapiens,HGNC:8156
156,trilaciclib,5442,Cyclin-dependent-like kinase 5,Kinase,Q00535,CDK5,CDK5_HUMAN,5.830,,IC50,...,DRUG LABEL,=,,,https://www.accessdata.fda.gov/drugsatfda_docs...,,INHIBITOR,Tchem,Homo sapiens,HGNC:1774


Get columns that are relevant.

In [9]:
drug_targets_values = drug_targets.copy()[['STRUCT_ID', 'DRUG_NAME', 'NEW_ID']]
drug_targets_values

Unnamed: 0,STRUCT_ID,DRUG_NAME,NEW_ID
0,91,adenosine triphosphate,HGNC:5241
1,91,adenosine triphosphate,HGNC:5232
2,186,amodiaquine,HGNC:620
3,220,anileridine,HGNC:8156
4,231,aprindine,HGNC:1442
...,...,...,...
153,5388,selumetinib,HGNC:3236
154,5389,tucatinib,HGNC:3236
155,5406,oliceridine,HGNC:8156
156,5442,trilaciclib,HGNC:1774


In [10]:
drug_targets_values.drop_duplicates(inplace=True)
drug_targets_values.shape[0]

150

Add new columns that are needed to be included in the dataframe.

In [11]:
for i, row in drug_targets_values.iterrows():
    drug_targets_values.loc[i,'id'] = f'TTD{i}'
    
    drug_targets_values.loc[i,'subject_id'] = str(row['STRUCT_ID'])
    drug_targets_values.loc[i,'subject_label'] = row['DRUG_NAME']
    drug_targets_values.loc[i,'subject_iri'] = np.nan
    drug_targets_values.loc[i,'subject_category'] = 'DRUG'
    drug_targets_values.loc[i,'subject_taxon_id'] = np.nan
    drug_targets_values.loc[i,'subject_taxon_label'] = np.nan
    
    drug_targets_values.loc[i,'object_id'] = row['NEW_ID']
    drug_targets_values.loc[i,'object_label'] = np.nan
    drug_targets_values.loc[i,'object_iri'] = np.nan
    drug_targets_values.loc[i,'object_category'] = np.nan
    drug_targets_values.loc[i,'object_taxon_id'] = np.nan
    drug_targets_values.loc[i,'object_taxon_label'] = np.nan
    
    drug_targets_values.loc[i,'relation_id'] = 'CustomRO:TTD'
    drug_targets_values.loc[i,'relation_label'] = 'targets'
    drug_targets_values.loc[i,'relation_iri'] = np.nan

In [12]:
drug_targets_values

Unnamed: 0,STRUCT_ID,DRUG_NAME,NEW_ID,id,subject_id,subject_label,subject_iri,subject_category,subject_taxon_id,subject_taxon_label,object_id,object_label,object_iri,object_category,object_taxon_id,object_taxon_label,relation_id,relation_label,relation_iri
0,91,adenosine triphosphate,HGNC:5241,TTD0,91,adenosine triphosphate,,DRUG,,,HGNC:5241,,,,,,CustomRO:TTD,targets,
1,91,adenosine triphosphate,HGNC:5232,TTD1,91,adenosine triphosphate,,DRUG,,,HGNC:5232,,,,,,CustomRO:TTD,targets,
2,186,amodiaquine,HGNC:620,TTD2,186,amodiaquine,,DRUG,,,HGNC:620,,,,,,CustomRO:TTD,targets,
3,220,anileridine,HGNC:8156,TTD3,220,anileridine,,DRUG,,,HGNC:8156,,,,,,CustomRO:TTD,targets,
4,231,aprindine,HGNC:1442,TTD4,231,aprindine,,DRUG,,,HGNC:1442,,,,,,CustomRO:TTD,targets,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,5388,selumetinib,HGNC:3236,TTD153,5388,selumetinib,,DRUG,,,HGNC:3236,,,,,,CustomRO:TTD,targets,
154,5389,tucatinib,HGNC:3236,TTD154,5389,tucatinib,,DRUG,,,HGNC:3236,,,,,,CustomRO:TTD,targets,
155,5406,oliceridine,HGNC:8156,TTD155,5406,oliceridine,,DRUG,,,HGNC:8156,,,,,,CustomRO:TTD,targets,
156,5442,trilaciclib,HGNC:1774,TTD156,5442,trilaciclib,,DRUG,,,HGNC:1774,,,,,,CustomRO:TTD,targets,


In [13]:
drugtarget_associations_df = drug_targets_values[list(constants.assoc_tuple_values)]
drugtarget_associations_df.to_csv(f'{constants.OUTPUT_FOLDER}/prev_ttd_associations.csv', index=None)

#### **Format DrugCentral data**

In [14]:
drug_diseases = pd.read_csv('prev/drug_to_disease_final_v2_ELA.csv')
drug_diseases

Unnamed: 0.1,Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,validated,review,ID
0,1302,91,Adenosine triphosphate,Bradycardia,Discontinued in Phase 2,bradycardia,Bradycardia,http://purl.obolibrary.org/obo/HP_0001662,False,False,HP:0001662
1,13599,522,Carvedilol,Congestive heart failure,Approved,congestive heart failure,Congestive heart failure,http://purl.obolibrary.org/obo/HP_0001635,False,False,HP:0001635
2,18485,2817,VESNARINONE,Cardiac failure,Approved,cardiac failure,Congestive heart failure,http://purl.obolibrary.org/obo/HP_0001635,False,False,HP:0001635
3,21288,2123,Phenelzine,Depression,Approved,depression,Depressivity,http://purl.obolibrary.org/obo/HP_0000716,False,False,HP:0000716
4,23772,4468,Esketamine,Depression,Phase 3,depression,Depressivity,http://purl.obolibrary.org/obo/HP_0000716,False,False,HP:0000716
5,25467,522,Carvedilol,Congestive heart failure,Approved,congestive heart failure,Congestive heart failure,http://purl.obolibrary.org/obo/HP_0001635,False,False,HP:0001635


In [15]:
drug_diseases_values = drug_diseases.copy()[['DRUG_ID', 'ID']]
drug_diseases_values

Unnamed: 0,DRUG_ID,ID
0,91,HP:0001662
1,522,HP:0001635
2,2817,HP:0001635
3,2123,HP:0000716
4,4468,HP:0000716
5,522,HP:0001635


In [16]:
for i, row in drug_diseases_values.iterrows():
    drug_diseases_values.loc[i,'id'] = f'DC{i}'
    
    drug_diseases_values.loc[i,'subject_id'] = row['DRUG_ID']
    drug_diseases_values.loc[i,'subject_label'] = np.nan
    drug_diseases_values.loc[i,'subject_iri'] = np.nan
    drug_diseases_values.loc[i,'subject_category'] = 'DRUG'
    drug_diseases_values.loc[i,'subject_taxon_id'] = np.nan
    drug_diseases_values.loc[i,'subject_taxon_label'] = np.nan
    
    drug_diseases_values.loc[i,'object_id'] = row['ID']
    drug_diseases_values.loc[i,'object_label'] = np.nan
    drug_diseases_values.loc[i,'object_iri'] = np.nan
    drug_diseases_values.loc[i,'object_category'] = np.nan
    drug_diseases_values.loc[i,'object_taxon_id'] = np.nan
    drug_diseases_values.loc[i,'object_taxon_label'] = np.nan
    
    drug_diseases_values.loc[i,'relation_id'] = 'CustomRO:DC'
    drug_diseases_values.loc[i,'relation_label'] = 'is substance that treats'
    drug_diseases_values.loc[i,'relation_iri'] = np.nan

In [17]:
drug_diseases_values

Unnamed: 0,DRUG_ID,ID,id,subject_id,subject_label,subject_iri,subject_category,subject_taxon_id,subject_taxon_label,object_id,object_label,object_iri,object_category,object_taxon_id,object_taxon_label,relation_id,relation_label,relation_iri
0,91,HP:0001662,DC0,91.0,,,DRUG,,,HP:0001662,,,,,,CustomRO:DC,is substance that treats,
1,522,HP:0001635,DC1,522.0,,,DRUG,,,HP:0001635,,,,,,CustomRO:DC,is substance that treats,
2,2817,HP:0001635,DC2,2817.0,,,DRUG,,,HP:0001635,,,,,,CustomRO:DC,is substance that treats,
3,2123,HP:0000716,DC3,2123.0,,,DRUG,,,HP:0000716,,,,,,CustomRO:DC,is substance that treats,
4,4468,HP:0000716,DC4,4468.0,,,DRUG,,,HP:0000716,,,,,,CustomRO:DC,is substance that treats,
5,522,HP:0001635,DC5,522.0,,,DRUG,,,HP:0001635,,,,,,CustomRO:DC,is substance that treats,


In [18]:
drugdiseases_associations_df = drug_diseases_values[list(constants.assoc_tuple_values)]
drugdiseases_associations_df.to_csv(f'{constants.OUTPUT_FOLDER}/prev_drugcentral_associations.csv', index=None)