Merge all fetched datasets originating from Monarch Initiative, TTD and DrugCentral.

Adapted by Rosa Zwart from Jupyter Notebook found at https://github.com/PPerdomoQ/rare-disease-explainer/blob/main/3_Predictions_and_explanations.ipynb

In [1]:
import os
import sys
import pandas as pd
import numpy as np

current_dir = os.getcwd()
project_root = os.path.dirname(os.path.dirname(current_dir))
sys.path.append(project_root)

import util.constants as constants

In [2]:
DISEASE_PREFIX = 'dmd'

#### **Format TTD data**

Save a list of all associations between DRUG and GENE in a csv file.

In [3]:
drug_targets = pd.read_csv(f'output/matched_drug_targets_{DISEASE_PREFIX}.csv')
drug_targets

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,...,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM,NEW_ID
0,(S)-nitrendipine,6,Voltage-dependent L-type calcium channel subun...,Ion channel,Q01668,CACNA1D,CAC1D_HUMAN,8.40,,IC50,...,IUPHAR,=,1.0,IUPHAR,,https://www.ebi.ac.uk/chembl/compound/inspect/...,GATING INHIBITOR,Tclin,Homo sapiens,HGNC:1391
1,(S)-nitrendipine,6,Voltage-dependent L-type calcium channel subun...,Ion channel,O60840,CACNA1F,CAC1F_HUMAN,6.00,,IC50,...,IUPHAR,~,,,,,GATING INHIBITOR,Tclin,Homo sapiens,HGNC:1393
2,(S)-nitrendipine,6,Voltage-dependent L-type calcium channel subun...,Ion channel,Q13936,CACNA1C,CAC1C_HUMAN,,,,...,SCIENTIFIC LITERATURE,,1.0,SCIENTIFIC LITERATURE,,https://pubmed.ncbi.nlm.nih.gov/17276408,BLOCKER,Tclin,Homo sapiens,HGNC:1390
3,phenylbutanoic acid,24,Histone deacetylase 2,Enzyme,Q92769,HDAC2,HDAC2_HUMAN,4.19,,IC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:4853
4,acetylcholine,65,Muscarinic acetylcholine receptor M1,GPCR,P11229,CHRM1,ACM1_HUMAN,6.11,,EC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:1950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,infigratinib,5459,Proto-oncogene tyrosine-protein kinase Src,Kinase,P12931,SRC,SRC_HUMAN,5.53,,IC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:11283
387,infigratinib,5459,Tyrosine-protein kinase Lyn,Kinase,P07948,LYN,LYN_HUMAN,6.52,,IC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:6735
388,infigratinib,5459,Proto-oncogene tyrosine-protein kinase Src,Kinase,P12931,SRC,SRC_HUMAN,5.53,,IC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:11283
389,infigratinib,5459,Tyrosine-protein kinase Lyn,Kinase,P07948,LYN,LYN_HUMAN,6.52,,IC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:6735


Get columns that are relevant.

In [4]:
drug_targets_values = drug_targets.copy()[['STRUCT_ID', 'DRUG_NAME', 'NEW_ID']]
drug_targets_values

Unnamed: 0,STRUCT_ID,DRUG_NAME,NEW_ID
0,6,(S)-nitrendipine,HGNC:1391
1,6,(S)-nitrendipine,HGNC:1393
2,6,(S)-nitrendipine,HGNC:1390
3,24,phenylbutanoic acid,HGNC:4853
4,65,acetylcholine,HGNC:1950
...,...,...,...
386,5459,infigratinib,HGNC:11283
387,5459,infigratinib,HGNC:6735
388,5459,infigratinib,HGNC:11283
389,5459,infigratinib,HGNC:6735


In [5]:
drug_targets_values.drop_duplicates(inplace=True)
drug_targets_values.shape[0]

380

Add new columns that are needed to be included in the dataframe.

In [6]:
dict_list = []

for i, row in drug_targets_values.iterrows():
    tuple_dict = {
        'id': f'TTD{i}',
        'subject_id': str(row['STRUCT_ID']),
        'subject_label': row['DRUG_NAME'],
        'subject_iri': np.nan,
        'subject_category': 'DRUG',
        'subject_taxon_id': np.nan,
        'subject_taxon_label': np.nan,
        'object_id': row['NEW_ID'],
        'object_label': np.nan,
        'object_iri': np.nan,
        'object_category': np.nan,
        'object_taxon_id': np.nan,
        'object_taxon_label': np.nan,
        'relation_id': 'CustomRO:TTD',
        'relation_label': 'targets',
        'relation_iri': np.nan
    }

    dict_list.append(tuple_dict)

drugtarget_associations_df = pd.DataFrame.from_dict(dict_list) 

In [7]:
drugtarget_associations_df

Unnamed: 0,id,subject_id,subject_label,subject_iri,subject_category,subject_taxon_id,subject_taxon_label,object_id,object_label,object_iri,object_category,object_taxon_id,object_taxon_label,relation_id,relation_label,relation_iri
0,TTD0,6,(S)-nitrendipine,,DRUG,,,HGNC:1391,,,,,,CustomRO:TTD,targets,
1,TTD1,6,(S)-nitrendipine,,DRUG,,,HGNC:1393,,,,,,CustomRO:TTD,targets,
2,TTD2,6,(S)-nitrendipine,,DRUG,,,HGNC:1390,,,,,,CustomRO:TTD,targets,
3,TTD3,24,phenylbutanoic acid,,DRUG,,,HGNC:4853,,,,,,CustomRO:TTD,targets,
4,TTD4,65,acetylcholine,,DRUG,,,HGNC:1950,,,,,,CustomRO:TTD,targets,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,TTD381,5431,tirbanibulin,,DRUG,,,HGNC:11283,,,,,,CustomRO:TTD,targets,
376,TTD382,5431,tirbanibulin,,DRUG,,,HGNC:6735,,,,,,CustomRO:TTD,targets,
377,TTD385,5443,casimersen,,DRUG,,,HGNC:2928,,,,,,CustomRO:TTD,targets,
378,TTD386,5459,infigratinib,,DRUG,,,HGNC:11283,,,,,,CustomRO:TTD,targets,


In [8]:
drugtarget_associations_df.to_csv(f'../../{constants.OUTPUT_FOLDER}/{DISEASE_PREFIX}/prev_{DISEASE_PREFIX}_ttd_associations.csv', index=None)

#### **Format DrugCentral data**

In [9]:
drug_diseases = pd.read_csv(f'output/matched_drug_to_disease_{DISEASE_PREFIX}.csv')
drug_diseases

Unnamed: 0.1,Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,validated,review,ID
0,975,4483,Carfilzomib,Small-cell lung cancer,Phase 1/2,small cell lung cancer,Small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030357,False,False,HP:0030357
1,2136,1043,Ergotamine,Headache,Approved,headache,Headache,http://purl.obolibrary.org/obo/HP_0002315,False,False,HP:0002315
2,2168,812,Desipramine,Attention deficit hyperactivity disorder,Approved,attention deficit hyperactivity disorder,Attention deficit hyperactivity disorder,http://purl.obolibrary.org/obo/HP_0007018,False,False,HP:0007018
3,2389,4225,Axitinib,Renal cell carcinoma,Approved,renal cell carcinoma,Renal cell carcinoma,http://purl.obolibrary.org/obo/HP_0005584,False,False,HP:0005584
4,2632,4175,Linagliptin,Type-2 diabetes,Approved,type 2 diabetes,Type II diabetes mellitus,http://purl.obolibrary.org/obo/HP_0005978,False,False,HP:0005978
...,...,...,...,...,...,...,...,...,...,...,...
68,25482,313,Diphenidol,Nausea,Approved,nausea,Nausea,http://purl.obolibrary.org/obo/HP_0002018,False,False,HP:0002018
69,25591,1142,Felodipine,Hypertension,Approved,hypertension,Hypertension,http://purl.obolibrary.org/obo/HP_0000822,False,False,HP:0000822
70,26312,2630,Thiethylperazine,Nausea,Approved,nausea,Nausea,http://purl.obolibrary.org/obo/HP_0002018,False,False,HP:0002018
71,27057,318,Benzbromarone,Gout,Approved,gout,Gout,http://purl.obolibrary.org/obo/HP_0001997,False,False,HP:0001997


In [10]:
drug_diseases_values = drug_diseases.copy()[['DRUG_ID', 'ID']]
drug_diseases_values

Unnamed: 0,DRUG_ID,ID
0,4483,HP:0030357
1,1043,HP:0002315
2,812,HP:0007018
3,4225,HP:0005584
4,4175,HP:0005978
...,...,...
68,313,HP:0002018
69,1142,HP:0000822
70,2630,HP:0002018
71,318,HP:0001997


In [11]:
dict_list = []

for i, row in drug_diseases_values.iterrows():
    tuple_dict = {
        'id': f'DC{i}',
        'subject_id': row['DRUG_ID'],
        'subject_label': np.nan,
        'subject_iri': np.nan,
        'subject_category': 'DRUG',
        'subject_taxon_id': np.nan,
        'subject_taxon_label': np.nan,
        'object_id': row['ID'],
        'object_label': np.nan,
        'object_iri': np.nan,
        'object_category': np.nan,
        'object_taxon_id': np.nan,
        'object_taxon_label': np.nan,
        'relation_id': 'CustomRO:DC',
        'relation_label': 'is substance that treats',
        'relation_iri': np.nan
    }

    dict_list.append(tuple_dict)

drugdiseases_associations_df = pd.DataFrame.from_dict(dict_list) 

In [12]:
drugdiseases_associations_df.head(10)

Unnamed: 0,id,subject_id,subject_label,subject_iri,subject_category,subject_taxon_id,subject_taxon_label,object_id,object_label,object_iri,object_category,object_taxon_id,object_taxon_label,relation_id,relation_label,relation_iri
0,DC0,4483,,,DRUG,,,HP:0030357,,,,,,CustomRO:DC,is substance that treats,
1,DC1,1043,,,DRUG,,,HP:0002315,,,,,,CustomRO:DC,is substance that treats,
2,DC2,812,,,DRUG,,,HP:0007018,,,,,,CustomRO:DC,is substance that treats,
3,DC3,4225,,,DRUG,,,HP:0005584,,,,,,CustomRO:DC,is substance that treats,
4,DC4,4175,,,DRUG,,,HP:0005978,,,,,,CustomRO:DC,is substance that treats,
5,DC5,4175,,,DRUG,,,HP:0005978,,,,,,CustomRO:DC,is substance that treats,
6,DC6,5431,,,DRUG,,,HP:0012125,,,,,,CustomRO:DC,is substance that treats,
7,DC7,1683,,,DRUG,,,HP:0012125,,,,,,CustomRO:DC,is substance that treats,
8,DC8,4175,,,DRUG,,,HP:0005978,,,,,,CustomRO:DC,is substance that treats,
9,DC9,4175,,,DRUG,,,HP:0005978,,,,,,CustomRO:DC,is substance that treats,


In [13]:
drugdiseases_associations_df.to_csv(f'../../{constants.OUTPUT_FOLDER}/{DISEASE_PREFIX}/prev_{DISEASE_PREFIX}_drugcentral_associations.csv', index=None)