Merge all fetched datasets originating from Monarch Initiative, TTD and DrugCentral.

Adapted by Rosa Zwart from the Jupyter Notebook found at https://github.com/PPerdomoQ/rare-disease-explainer/blob/main/3_Predictions_and_explanations.ipynb

In [4]:
import os
import sys
import pandas as pd
import numpy as np

# Resolve the project root (two directory levels above the current working
# directory) and make it importable, so that the project-local import that
# follows (`util.constants`) can be resolved.
current_dir = os.getcwd()
project_root = os.path.dirname(os.path.dirname(current_dir))

# Register the path only once: re-running this cell must not keep appending
# duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

import util.constants as constants

In [6]:
# Prefix of the disease-specific dataset to process; it is interpolated into the
# input and output file names used by the cells below.
DISEASE_PREFIX = 'oi'

#### **Format TTD data**

Save a list of all associations between DRUG and GENE to a CSV file.

In [7]:
# Load the matched drug-target records for the selected disease prefix and
# display them for inspection.
drug_targets_path = f'output/matched_drug_targets_{DISEASE_PREFIX}.csv'
drug_targets = pd.read_csv(drug_targets_path)
drug_targets

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,...,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM,NEW_ID
0,aclarubicin,80,72 kDa type IV collagenase,Enzyme,P08253,MMP2,MMP2_HUMAN,5.00,,IC50,...,CHEMBL,=,,,,,,Tchem,Homo sapiens,HGNC:7166
1,aldosterone,111,Sex hormone-binding globulin,Secreted,P04278,SHBG,SHBG_HUMAN,5.32,,Kd,...,CHEMBL,=,,,,,,Tchem,Homo sapiens,HGNC:10839
2,aminoquinuride,174,Matrix metalloproteinase-9,Enzyme,P14780,MMP9,MMP9_HUMAN,4.00,,IC50,...,CHEMBL,=,,,,,,Tchem,Homo sapiens,HGNC:7176
3,androstenediol,214,Sex hormone-binding globulin,Secreted,P04278,SHBG,SHBG_HUMAN,9.17,,Kd,...,CHEMBL,=,,,,,,Tchem,Homo sapiens,HGNC:10839
4,androstenedione,215,Sex hormone-binding globulin,Secreted,P04278,SHBG,SHBG_HUMAN,7.46,,Kd,...,CHEMBL,=,,,,,,Tchem,Homo sapiens,HGNC:10839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,pralsetinib,5412,Platelet-derived growth factor receptor beta,Kinase,P09619,PDGFRB,PGFRB_HUMAN,7.00,,IC50,...,DRUG LABEL,=,,,https://www.accessdata.fda.gov/drugsatfda_docs...,,INHIBITOR,Tclin,Homo sapiens,HGNC:8804
204,filgotinib,5419,Vascular endothelial growth factor receptor 3,Kinase,P35916,FLT4,VGFR3_HUMAN,6.56,,IC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:3767
205,filgotinib,5419,Vascular endothelial growth factor receptor 3,Kinase,P35916,FLT4,VGFR3_HUMAN,6.56,,IC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:3767
206,infigratinib,5459,Vascular endothelial growth factor receptor 3,Kinase,P35916,FLT4,VGFR3_HUMAN,5.68,,IC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:3767


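Keep only the columns that are relevant for the association list: the drug identifier (`STRUCT_ID`), the drug name (`DRUG_NAME`) and the target gene identifier (`NEW_ID`).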

In [8]:
# Keep only the columns needed for the association list. Selecting first and
# copying afterwards yields an independent frame without first duplicating all
# the columns that are discarded anyway.
drug_targets_values = drug_targets[['STRUCT_ID', 'DRUG_NAME', 'NEW_ID']].copy()
drug_targets_values

Unnamed: 0,STRUCT_ID,DRUG_NAME,NEW_ID
0,80,aclarubicin,HGNC:7166
1,111,aldosterone,HGNC:10839
2,174,aminoquinuride,HGNC:7176
3,214,androstenediol,HGNC:10839
4,215,androstenedione,HGNC:10839
...,...,...,...
203,5412,pralsetinib,HGNC:8804
204,5419,filgotinib,HGNC:3767
205,5419,filgotinib,HGNC:3767
206,5459,infigratinib,HGNC:3767


In [9]:
# Collapse repeated (STRUCT_ID, DRUG_NAME, NEW_ID) rows and report how many
# unique rows remain. Row labels are not renumbered, so they may contain gaps.
drug_targets_values = drug_targets_values.drop_duplicates()
len(drug_targets_values)

203

Add the remaining columns that the association dataframe requires; fields for which no value is available are left empty (NaN).

In [10]:
# Build one association record per unique drug-target row.
# - 'id' is 'TTD' followed by the row's index label in drug_targets_values.
# - The drug is the subject (identifier stored as text, plus its name).
# - The target identifier (NEW_ID) is the object.
# - Every field without a known value is filled with NaN.
dict_list = [
    {
        'id': f'TTD{row_label}',
        'subject_id': str(struct_id),
        'subject_label': drug_name,
        'subject_iri': np.nan,
        'subject_category': 'DRUG',
        'subject_taxon_id': np.nan,
        'subject_taxon_label': np.nan,
        'object_id': target_id,
        'object_label': np.nan,
        'object_iri': np.nan,
        'object_category': np.nan,
        'object_taxon_id': np.nan,
        'object_taxon_label': np.nan,
        'relation_id': 'CustomRO:TTD',
        'relation_label': 'targets',
        'relation_iri': np.nan,
    }
    for row_label, struct_id, drug_name, target_id in zip(
        drug_targets_values.index,
        drug_targets_values['STRUCT_ID'],
        drug_targets_values['DRUG_NAME'],
        drug_targets_values['NEW_ID'],
    )
]

# One row per record; the column order follows the key order of the records.
drugtarget_associations_df = pd.DataFrame(dict_list)

In [11]:
# Display the formatted drug-target associations for visual inspection.
drugtarget_associations_df

Unnamed: 0,id,subject_id,subject_label,subject_iri,subject_category,subject_taxon_id,subject_taxon_label,object_id,object_label,object_iri,object_category,object_taxon_id,object_taxon_label,relation_id,relation_label,relation_iri
0,TTD0,80,aclarubicin,,DRUG,,,HGNC:7166,,,,,,CustomRO:TTD,targets,
1,TTD1,111,aldosterone,,DRUG,,,HGNC:10839,,,,,,CustomRO:TTD,targets,
2,TTD2,174,aminoquinuride,,DRUG,,,HGNC:7176,,,,,,CustomRO:TTD,targets,
3,TTD3,214,androstenediol,,DRUG,,,HGNC:10839,,,,,,CustomRO:TTD,targets,
4,TTD4,215,androstenedione,,DRUG,,,HGNC:10839,,,,,,CustomRO:TTD,targets,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,TTD201,5389,tucatinib,,DRUG,,,HGNC:3236,,,,,,CustomRO:TTD,targets,
199,TTD202,5394,ripretinib,,DRUG,,,HGNC:8804,,,,,,CustomRO:TTD,targets,
200,TTD203,5412,pralsetinib,,DRUG,,,HGNC:8804,,,,,,CustomRO:TTD,targets,
201,TTD204,5419,filgotinib,,DRUG,,,HGNC:3767,,,,,,CustomRO:TTD,targets,


In [12]:
# Write the formatted drug-target associations to the disease-specific output folder.
ttd_output_path = f'../../{constants.OUTPUT_FOLDER}/{DISEASE_PREFIX}/prev_{DISEASE_PREFIX}_ttd_associations.csv'

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(ttd_output_path), exist_ok=True)

# index=False keeps the row labels out of the file; the previous index=None
# had the same effect but relied on None being falsy.
drugtarget_associations_df.to_csv(ttd_output_path, index=False)

#### **Format DrugCentral data**

In [13]:
# Load the matched drug-to-disease records for the selected disease prefix and
# display them for inspection.
drug_diseases_path = f'output/matched_drug_to_disease_{DISEASE_PREFIX}.csv'
drug_diseases = pd.read_csv(drug_diseases_path)
drug_diseases

Unnamed: 0.1,Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,validated,review,ID
0,878,1932,Nilotinib,Chronic myelogenous leukaemia,Approved,chronic myelogenous leukaemia,Chronic myelogenous leukemia,http://purl.obolibrary.org/obo/HP_0005506,False,False,HP:0005506
1,1167,1879,Nandrolone,Osteoporosis,Approved,osteoporosis,Osteoporosis,http://purl.obolibrary.org/obo/HP_0000939,False,False,HP:0000939
2,1291,542,Cefonicid,Bacterial infection,Approved,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,False,False,HP:0100658
3,1945,574,Cefalotin,Bacterial infection,Approved,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,False,False,HP:0100658
4,2262,2351,Raloxifene,Osteoporosis,Approved,osteoporosis,Osteoporosis,http://purl.obolibrary.org/obo/HP_0000939,False,False,HP:0000939
5,5006,4138,Gentian violet,Bacterial infection,Approved,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,False,False,HP:0100658
6,7442,576,Cefradine,Bacterial infection,Approved,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,False,False,HP:0100658
7,8028,2607,Testosterone,Osteoporosis,Approved,osteoporosis,Osteoporosis,http://purl.obolibrary.org/obo/HP_0000939,False,False,HP:0000939
8,8752,4977,Siltuximab,Anemia,Approved,anemia,Anemia,http://purl.obolibrary.org/obo/HP_0001903,False,False,HP:0001903
9,9195,960,Doxorubicin,Tumour,Investigative,tumour,Neoplasm,http://purl.obolibrary.org/obo/HP_0002664,False,False,HP:0002664


In [14]:
# Keep only the drug identifier and the matched ontology term identifier.
# Selecting first and copying afterwards yields an independent frame without
# first duplicating all the columns that are discarded anyway.
drug_diseases_values = drug_diseases[['DRUG_ID', 'ID']].copy()
drug_diseases_values

Unnamed: 0,DRUG_ID,ID
0,1932,HP:0005506
1,1879,HP:0000939
2,542,HP:0100658
3,574,HP:0100658
4,2351,HP:0000939
5,4138,HP:0100658
6,576,HP:0100658
7,2607,HP:0000939
8,4977,HP:0001903
9,960,HP:0002664


In [15]:
# Build one association record per drug-disease row.
# - 'id' is 'DC' followed by the row's index label in drug_diseases_values.
# - The drug (DRUG_ID) is the subject; the matched term (ID) is the object.
# - Every field without a known value is filled with NaN.
dict_list = [
    {
        'id': f'DC{row_label}',
        # Stored as text, consistent with the subject_id of the drug-target
        # associations built earlier in this notebook; the written CSV is
        # identical either way.
        'subject_id': str(drug_id),
        'subject_label': np.nan,
        'subject_iri': np.nan,
        'subject_category': 'DRUG',
        'subject_taxon_id': np.nan,
        'subject_taxon_label': np.nan,
        'object_id': term_id,
        'object_label': np.nan,
        'object_iri': np.nan,
        'object_category': np.nan,
        'object_taxon_id': np.nan,
        'object_taxon_label': np.nan,
        'relation_id': 'CustomRO:DC',
        'relation_label': 'is substance that treats',
        'relation_iri': np.nan,
    }
    for row_label, drug_id, term_id in zip(
        drug_diseases_values.index,
        drug_diseases_values['DRUG_ID'],
        drug_diseases_values['ID'],
    )
]

# One row per record; the column order follows the key order of the records.
drugdiseases_associations_df = pd.DataFrame(dict_list)

In [16]:
# Display the first ten formatted drug-disease associations for visual inspection.
drugdiseases_associations_df.head(10)

Unnamed: 0,id,subject_id,subject_label,subject_iri,subject_category,subject_taxon_id,subject_taxon_label,object_id,object_label,object_iri,object_category,object_taxon_id,object_taxon_label,relation_id,relation_label,relation_iri
0,DC0,1932,,,DRUG,,,HP:0005506,,,,,,CustomRO:DC,is substance that treats,
1,DC1,1879,,,DRUG,,,HP:0000939,,,,,,CustomRO:DC,is substance that treats,
2,DC2,542,,,DRUG,,,HP:0100658,,,,,,CustomRO:DC,is substance that treats,
3,DC3,574,,,DRUG,,,HP:0100658,,,,,,CustomRO:DC,is substance that treats,
4,DC4,2351,,,DRUG,,,HP:0000939,,,,,,CustomRO:DC,is substance that treats,
5,DC5,4138,,,DRUG,,,HP:0100658,,,,,,CustomRO:DC,is substance that treats,
6,DC6,576,,,DRUG,,,HP:0100658,,,,,,CustomRO:DC,is substance that treats,
7,DC7,2607,,,DRUG,,,HP:0000939,,,,,,CustomRO:DC,is substance that treats,
8,DC8,4977,,,DRUG,,,HP:0001903,,,,,,CustomRO:DC,is substance that treats,
9,DC9,960,,,DRUG,,,HP:0002664,,,,,,CustomRO:DC,is substance that treats,


In [17]:
# Write the formatted drug-disease associations to the disease-specific output folder.
drugcentral_output_path = f'../../{constants.OUTPUT_FOLDER}/{DISEASE_PREFIX}/prev_{DISEASE_PREFIX}_drugcentral_associations.csv'

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(drugcentral_output_path), exist_ok=True)

# index=False keeps the row labels out of the file; the previous index=None
# had the same effect but relied on None being falsy.
drugdiseases_associations_df.to_csv(drugcentral_output_path, index=False)