Taken from https://github.com/PPerdomoQ/rare-disease-explainer/blob/main/2_Add_drug_info.ipynb with adjustments

## **Import**

In [2]:
import re
import pandas as pd
import numpy as np

import requests
import time

## Parameters

In [3]:
DISEASE_PREFIX = 'oi'

## **Drug-Target Information**

### **Load Files**

Load the information from Drug Central and Monarch: 

In [17]:
df = pd.read_csv('../../input/drug.target.interaction.tsv', header=0, index_col=0, sep='\t')
nodes = pd.read_csv(f'output/prev_{DISEASE_PREFIX}_monarch_nodes.csv', header=0)

In [18]:
print(len(nodes))
nodes.head(5)

9732


Unnamed: 0,id,semantic_groups,name
0,ENSEMBL:ENSCAFG00000008883,ORTH,TLL1
1,ENSEMBL:ENSECAG00000025065,ORTH,ENSEMBL:ENSECAG00000025065
2,HGNC:8799,GENE,PDGFA
3,ZFIN:ZDB-GENE-100623-1,ORTH,thbs2b
4,ZP:0002795,DISO,"axon MiP motor neuron absent, abnormal"


In [19]:
df

Unnamed: 0_level_0,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,ACT_COMMENT,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM
DRUG_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.890,,IC50,Inhibition of wild-type human ERG channel expr...,CHEMBL,=,,,,,,Tclin,Homo sapiens
levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,SCN1A_HUMAN,5.790,,IC50,,WOMBAT-PK,=,,,,,,Tclin,Homo sapiens
levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,,WOMBAT-PK,,1.0,CHEMBL,,https://www.ebi.ac.uk/chembl/compound/inspect/...,BLOCKER,Tclin,Homo sapiens
levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PTGER1,PE2R1_HUMAN,,,,,WOMBAT-PK,,,,,,,Tclin,Homo sapiens
levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CYP2D6,CP2D6_HUMAN,6.707,,IC50,"DRUGMATRIX: CYP450, 2D6 enzyme inhibition (sub...",DRUG MATRIX,=,,,,,,Tclin,Homo sapiens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
samidorphan,5460,Delta-type opioid receptor,GPCR,P41143,OPRD1,OPRD_HUMAN,8.590,,Ki,,DRUG LABEL,=,,,https://www.accessdata.fda.gov/drugsatfda_docs...,,PARTIAL AGONIST,Tclin,Homo sapiens
sotorasib,5461,GTPase KRas,Enzyme,P01116,KRAS,RASK_HUMAN,7.030,,IC50,KRAS G12C mutation,DRUG LABEL,=,1.0,DRUG LABEL,https://www.accessdata.fda.gov/drugsatfda_docs...,https://www.accessdata.fda.gov/drugsatfda_docs...,INHIBITOR,Tchem,Homo sapiens
ibrexafungerp,5462,"Beta-1,3-glucan synthase catalytic subunit 1",Enzyme,O13428,GSC1,O13428_CANAX,8.350,,IC50,"MoA - inhibits the biosynthesis of beta-(1,3)-...",SCIENTIFIC LITERATURE,=,1.0,DRUG LABEL,https://pubmed.ncbi.nlm.nih.gov/24323472,https://www.accessdata.fda.gov/drugsatfda_docs...,INHIBITOR,,Candida albicans
ibrexafungerp,5462,"1,3-Beta-D-glucan-UDP glucosyltransferase",Enzyme,Q6FTN8,FKS1,Q6FTN8_CANGA,7.830,,IC50,"MoA - inhibits the biosynthesis of beta-(1,3)-...",SCIENTIFIC LITERATURE,=,1.0,SCIENTIFIC LITERATURE,https://pubmed.ncbi.nlm.nih.gov/24323472,https://www.accessdata.fda.gov/drugsatfda_docs...,INHIBITOR,,Candida glabrata


In [20]:
df['NEW_ID'] = ''

### **Uniprot to Monarch IDs**

Use Uniprot API to obtain new target IDs: 

In [21]:
"""
    @description: This module maps UniProt accession IDs to other databases depending on taxon. 
    @source: https://www.uniprot.org/help/id_mapping
    @author: Rosa Zwart
"""

RETRIES = 3
POLLING_S_INTERVAL = 5

FROM_DB = 'UniProtKB_AC-ID'
DEFAULT_TO_DB = 'Ensembl'

db_mapper = {
    'Homo sapiens': 'HGNC',
    'Rattus norvegicus': 'RGD',
    'Mus musculus': 'MGI',
    'Drosophila melanogaster': 'FlyBase',
    'Caenorhabditis elegans': 'WormBase',
    'Danio rerio': 'ZFIN',
    'Escherichia coli': 'EnsemblGenome',
    'Xenopus tropicalis': 'Xenbase',
    'Dictyostelium discoideum': 'dictyBase',
    'Saccharomyces cerevisiae S288C': 'SGD',
    'Schizosaccharomyces pombe': 'PomBase'
}

class IdMapper:
    def __init__(self, ids_to_map: list, to_db = DEFAULT_TO_DB, from_db = FROM_DB):
        self.url = 'https://rest.uniprot.org'
        self.job_id = self.submit_id_mapping(ids_to_map, to_db, from_db)

        if self.check_job_ready():
            self.results = self.get_results()
        
    def submit_id_mapping(self, id_list, to_db, from_db):
        print(f'Map to database {to_db}')
        
        data_params = {
            'from': from_db,
            'to': to_db,
            'ids': ','.join(id_list)
        }
        
        for i in range(RETRIES):
            try:  
                response = requests.post(f'{self.url}/idmapping/run', data=data_params)
                response.raise_for_status()
                return response.json()['jobId']
            except Exception as e:
                if (i < RETRIES - 1):
                    print(f'Retrying in {POLLING_S_INTERVAL}s')
                    time.sleep(POLLING_S_INTERVAL)
                    continue
                else:
                    print(f'After all attempts, request could not be submitted due to {e}')
                    return None
                
    def check_job_ready(self):
        while self.job_id:
            try:
                response = requests.get(f'{self.url}/idmapping/status/{self.job_id}')
                response.raise_for_status()
                response_values = response.json()
                if 'jobStatus' in response_values:
                    if response_values['jobStatus'] == 'RUNNING':
                        print(f'Check again after {POLLING_S_INTERVAL}s')
                        time.sleep(POLLING_S_INTERVAL)
                    
                    elif response_values['jobStatus'] == 'FINISHED':
                        print('Job is finished')
                        return True
                    
                    else:
                        print(f'Job {self.job_id} had status {response_values["jobStatus"]}, stopped checking if job is ready.')
                        return False
                elif 'results' in response_values:
                        return True
                else:
                    return False
            except Exception as e:
                print(f'Failed to check whether job is finished due to {e}, try again after {POLLING_S_INTERVAL}s...')
                time.sleep(POLLING_S_INTERVAL)
                
    def get_results(self):
        for i in range(RETRIES):
            try:  
                response = requests.get(f'{self.url}/idmapping/stream/{self.job_id}')
                response.raise_for_status()
                return response.json()['results']
            except Exception as e:
                if (i < RETRIES - 1):
                    print(f'Retrying in {POLLING_S_INTERVAL}s')
                    time.sleep(POLLING_S_INTERVAL)
                    continue
                else:
                    print(f'After all attempts, request could not be submitted due to {e}')
                    return None

In [22]:
def get_single_id(id):
    """
        Get from list of IDs split by `|`, one ID.
        :return Single ID value
    """
    split_id = id.split('|')
    if len(split_id) > 1:
        return split_id[0]
    else:
        return id

def fetch_id_mappings(entries: pd.DataFrame, map_to_db):
    """
        Get ID mappings of IDs present in given dataframe. Mappings are based on given database to which the IDs need to be mapped.
        :param entries: Dataframe containing column name `ACCESSION`
        :param map_to_db: Name of database to which the given IDs need to be mapped
    """
    if (entries.shape[0] > 0):
        id_entries_to_map = entries.copy()
        id_entries_to_map['ACCESSION'] = entries.apply(lambda row: get_single_id(row['ACCESSION']), axis=1)
        
        mapper = IdMapper(ids_to_map=id_entries_to_map['ACCESSION'].to_list(), to_db=map_to_db)
        if hasattr(mapper, 'results'):
            return mapper.results
        else:
            return []
    else:
        return []

def get_mapped_ids(drug_targets):
    """
        Get mapped IDs for all included databases.
        :param drug_targets: Dataframe that contains column name `ORGANISM` and `ACCESSION`
    """
    all_mapped_id_results = []
    all_taxon_names = list(db_mapper.keys())
    
    for taxon in all_taxon_names:
        relevant_entries = drug_targets[drug_targets['ORGANISM'].str.contains(taxon)]
        id_mappings = fetch_id_mappings(relevant_entries, db_mapper[taxon])
        all_mapped_id_results = all_mapped_id_results + id_mappings

    # Map entity ids of leftover organisms to default database
    other_relevant_entries = drug_targets[~drug_targets['ORGANISM'].isin(all_taxon_names)]
    other_id_mappings = fetch_id_mappings(other_relevant_entries, DEFAULT_TO_DB)
    all_mapped_id_results = all_mapped_id_results + other_id_mappings
                
    return all_mapped_id_results

results = get_mapped_ids(drug_targets=df)

Map to database HGNC
Map to database RGD
Map to database MGI
Map to database FlyBase
Map to database WormBase
Map to database ZFIN
Map to database EnsemblGenome
Retrying in 5s
Retrying in 5s
After all attempts, request could not be submitted due to 400 Client Error:  for url: https://rest.uniprot.org/idmapping/run
Map to database PomBase
Map to database Ensembl


In [23]:
def check_split_values(value, id_value):
    parts = value.split('|')
    return all(part == id_value for part in parts)

for mapping_result in results:
    original_id = mapping_result['from']
    new_id = mapping_result['to']
    
    df.loc[df['ACCESSION'].apply(check_split_values, id_value=original_id), 'NEW_ID'] = new_id

In [24]:
df['NEW_ID'] = df['NEW_ID'].replace('', np.nan)

In [25]:
print(f'Tried to match IDs to ACCESSION IDs for {len(df)} rows.')

Tried to match IDs to ACCESSION IDs for 19378 rows.


Drop rows that don't have a new ID: 

In [26]:
df = df.dropna(subset=['NEW_ID'])
print(f'All found matched IDs to ACCESSION IDs for {len(df)} rows.')

All found matched IDs to ACCESSION IDs for 16141 rows.


Keep only rows whose the target is in the Monarch nodes:

In [27]:
df = df[df['NEW_ID'].isin(list(nodes['id']))]
print(f'All targets that match with Monarch nodes consisting of {len(df)} rows.')

All targets that match with Monarch nodes consisting of 208 rows.


Save the Drug-Target dataframe as .csv: 

In [28]:
df.to_csv(f'output/matched_drug_targets_{DISEASE_PREFIX}.csv', encoding = 'utf-8-sig') 

Create and save a dataframe containing the drug nodes: 

In [30]:
df['DRUG_NAME'] = df.index
df_drugs = df[['DRUG_NAME', 'STRUCT_ID']]
df_drugs = df_drugs.drop_duplicates()
df_drugs.to_csv(f'output/drug_nodes_{DISEASE_PREFIX}.csv', encoding = 'utf-8-sig') 

In [31]:
df_drugs.head(10)

Unnamed: 0_level_0,DRUG_NAME,STRUCT_ID
DRUG_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1
aclarubicin,aclarubicin,80
aldosterone,aldosterone,111
aminoquinuride,aminoquinuride,174
androstenediol,androstenediol,214
androstenedione,androstenedione,215
astemizole,astemizole,249
bacitracin,bacitracin,281
beclometasone dipropionate,beclometasone dipropionate,294
budesonide,budesonide,419
busulfan,busulfan,438


In [32]:
len(df_drugs)

153

## **Drug-Disease Information**

### **Text to CSV**

If already saved, run only the cell of this section, otherwise continue:

Create a Dataframe that will contain the Drug-Disease information:

In [33]:
column_names = ["DRUG_ID", "DRUG_NAME", "DISEASES", 'PHASE']

df = pd.DataFrame(columns = column_names)

Read the .txt file downloaded from TTD and convert it into a pandas dataframe:

In [34]:
with open('../../input/P1-05-Drug_disease.txt') as f:
  lines = f.readlines()[22:]

In [35]:
i = 0
id = ''
name = ''
disease = ''
phase = ''

for line in lines: 
  if line.startswith('DRUGNAME'): 
    result = re.search('\t(.*)\n', line)
    name = result.group(1)
  elif line.startswith('TTDDRUID'): 
    result = re.search('\t(.*)\n', line)
    id = result.group(1)
  elif line.startswith('INDICATI'): 
    result = re.search('\t(.*)\[', line)
    result2 = re.search('\](.*)', line)
    disease = result.group(1) 
    phase = result2.group(1) 
    df.loc[i] = [id,name,disease, phase]
    i += 1
  elif line.startswith('\n'): 
    id = ''
    name = ''
    disease = ''
    phase = ''


  result = re.search('\t(.*)\[', line)
  result2 = re.search('\](.*)', line)


In [36]:
df

Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE
0,D00ABE,ALD-301,Ischemia,Phase 2
1,D00ABE,ALD-301,Peripheral arterial disease,Phase 2
2,D00ABO,KW-2449,Acute myeloid leukaemia,Phase 1
3,D00ABW,Opterone,Hormone deficiency,Discontinued in Phase 1
4,D00ACC,ND1251,Depression,Discontinued in Phase 1
...,...,...,...,...
28973,DZTX12,ASC-J9,End-stage renal disease,Phase 2
28974,DZU72C,OKI 179,Solid tumour/cancer,Phase 1
28975,DZVF15,TRN-110,Attention deficit hyperactivity disorder,Clinical Trial
28976,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2


In [37]:
df.to_csv('output/drug_to_disease.csv', encoding = 'utf-8-sig') 

Run only this cell if you have already saved the Drug-Disease interactions: 

In [38]:
df = pd.read_csv('output/drug_to_disease.csv', header=0, index_col=0)

In [39]:
df

Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE
0,D00ABE,ALD-301,Ischemia,Phase 2
1,D00ABE,ALD-301,Peripheral arterial disease,Phase 2
2,D00ABO,KW-2449,Acute myeloid leukaemia,Phase 1
3,D00ABW,Opterone,Hormone deficiency,Discontinued in Phase 1
4,D00ACC,ND1251,Depression,Discontinued in Phase 1
...,...,...,...,...
28973,DZTX12,ASC-J9,End-stage renal disease,Phase 2
28974,DZU72C,OKI 179,Solid tumour/cancer,Phase 1
28975,DZVF15,TRN-110,Attention deficit hyperactivity disorder,Clinical Trial
28976,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2


In [40]:
df.to_csv('output/drug_to_disease.csv', encoding = 'utf-8-sig') 

### **Load CSVs**

Load the drug nodes (obtained from Drug Central) and the Monarch nodes: 

In [41]:
graph_drugs = pd.read_csv(f'output/drug_nodes_{DISEASE_PREFIX}.csv', header=0)
nodes = pd.read_csv(f'output/prev_{DISEASE_PREFIX}_monarch_nodes.csv', header=0)

Get list of unique drugs and diseases:

In [42]:
graph_diseases = nodes[nodes['semantic_groups'] == 'DISO']

unique_diseases = graph_diseases['name'].unique()
unique_dis_id = graph_diseases['id'].unique()
unique_diseases = [x.lower() for x in unique_diseases]

print(f'There are {len(unique_diseases)} diseases found in Monarch nodes.')

There are 5313 diseases found in Monarch nodes.


In [43]:
unique_drugs = graph_drugs['DRUG_NAME'].unique()
unique_drugs = [x.lower() for x in unique_drugs]
print(f'There are {len(unique_drugs)} unique drugs found in DrugCentral entries.')

There are 153 unique drugs found in DrugCentral entries.


At this point you should use SORTA tool (https://sorta.molgeniscloud.org/menu/main/sorta?) to match the IDs of TTD to Human Phenotype Ontology (HPO). The file is also available in the project's Github:

In [44]:
matched = pd.read_csv('../../input/matched_phenotypes.csv', header = 0, delimiter = ';')

In [45]:
matched

Unnamed: 0,Name,ontologyTermName,ontologyTermIRI,score,validated,review
0,respiratory failure,Respiratory failure,http://purl.obolibrary.org/obo/HP_0002878,100.00,False,False
1,sexual dysfunction,Male sexual dysfunction,http://purl.obolibrary.org/obo/HP_0040307,86.49,False,False
2,achondroplasia,Bronchodysplasia,http://purl.obolibrary.org/obo/HP_0006533,62.50,False,False
3,glabellar frown line,Prominent glabella,http://purl.obolibrary.org/obo/HP_0002057,59.46,False,False
4,testicular germ cell tumour,Testicular neoplasm,http://purl.obolibrary.org/obo/HP_0010788,71.11,False,False
...,...,...,...,...,...,...
1796,systemic mastocytosis,Mastocytosis,http://purl.obolibrary.org/obo/HP_0100495,77.42,False,False
1797,acute iron or aluminum toxicity,Abnormal total iron binding capacity,http://purl.obolibrary.org/obo/HP_0033212,48.15,False,False
1798,chronic inflammatory demyelinating polyneuropathy,Acute demyelinating polyneuropathy,http://purl.obolibrary.org/obo/HP_0007131,64.86,False,False
1799,aortic aneurysm,Aortic aneurysm,http://purl.obolibrary.org/obo/HP_0004942,100.00,False,False


Select only those IDs with a score greater than 80:

In [46]:
matched = matched[matched['score'] > 80]

Create the final ID:

In [47]:
matched['ID'] = matched['ontologyTermIRI'].str.split('/obo/').str[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched['ID'] = matched['ontologyTermIRI'].str.split('/obo/').str[1]


In [48]:
new_id = []
for i in matched['ID']: 
  id_value = re.sub("[^0-9a-zA-Z]+", ":", i)
  new_id.append(id_value)
matched['ID'] = new_id

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched['ID'] = new_id


In [49]:
matched.head(10)

Unnamed: 0,Name,ontologyTermName,ontologyTermIRI,score,validated,review,ID
0,respiratory failure,Respiratory failure,http://purl.obolibrary.org/obo/HP_0002878,100.0,False,False,HP:0002878
1,sexual dysfunction,Male sexual dysfunction,http://purl.obolibrary.org/obo/HP_0040307,86.49,False,False,HP:0040307
6,pollakiuria,Pollakisuria,http://purl.obolibrary.org/obo/HP_0100515,88.0,False,False,HP:0100515
17,dental caries,Carious teeth,http://purl.obolibrary.org/obo/HP_0000670,100.0,False,False,HP:0000670
22,hematopoietic stem cell transplantation,History of bone marrow transplant,http://purl.obolibrary.org/obo/HP_0032557,85.71,False,False,HP:0032557
25,gastritis,Gastritis,http://purl.obolibrary.org/obo/HP_0005263,100.0,False,False,HP:0005263
26,arthralgia,Arthralgia,http://purl.obolibrary.org/obo/HP_0002829,100.0,False,False,HP:0002829
27,attention deficit hyperactivity disorder,Attention deficit hyperactivity disorder,http://purl.obolibrary.org/obo/HP_0007018,100.0,False,False,HP:0007018
28,female hypogonadism,Female hypogonadism,http://purl.obolibrary.org/obo/HP_0000134,100.0,False,False,HP:0000134
29,vitamin b1 deficiency,Low levels of vitamin B1,http://purl.obolibrary.org/obo/HP_0100503,100.0,False,False,HP:0100503


### **Merging**

In [50]:
modified = []
for d in df['DISEASES']: 
  new_string = re.sub("[^0-9a-zA-Z]+", " ", d)
  modified.append(new_string.lower())
modified

['ischemia ',
 'peripheral arterial disease ',
 'acute myeloid leukaemia ',
 'hormone deficiency ',
 'depression ',
 'bacterial infection ',
 'solid tumour cancer ',
 'gram positive bacterial infection ',
 'solid tumour cancer ',
 'recurrent glioblastoma ',
 'human immunodeficiency virus infection ',
 'non small cell lung cancer ',
 'metastatic melanoma ',
 'melanoma ',
 'central nervous system disease ',
 'solid tumour cancer ',
 'mycobacterium infection ',
 'neuropathic pain ',
 'non insulin dependent diabetes ',
 'influenza virus infection ',
 'chronic obstructive pulmonary disease ',
 'breast cancer ',
 'liver disease ',
 'asthma ',
 'influenza virus infection ',
 'peripheral vascular disease ',
 'cystitis ',
 'chronic pain ',
 'anemia ',
 'herpes simplex virus infection ',
 'middle east respiratory syndrome mers ',
 'small cell lung cancer ',
 'obesity ',
 'cystic fibrosis ',
 'type 1 diabetes ',
 'type 2 diabetes ',
 'salmonella infection ',
 'hyperlipidaemia ',
 'diagnostic imag

In [51]:
df['Name'] = modified

In [52]:
df['Name'] = df['Name'].str.strip()
matched['Name'] = matched['Name'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched['Name'] = matched['Name'].str.strip()


In [53]:
final = pd.merge(df, matched, on = 'Name', how = 'left')

In [54]:
final

Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,score,validated,review,ID
0,D00ABE,ALD-301,Ischemia,Phase 2,ischemia,,,,,,
1,D00ABE,ALD-301,Peripheral arterial disease,Phase 2,peripheral arterial disease,Peripheral arterial stenosis,http://purl.obolibrary.org/obo/HP_0004950,100.0,False,False,HP:0004950
2,D00ABO,KW-2449,Acute myeloid leukaemia,Phase 1,acute myeloid leukaemia,Acute myeloid leukemia,http://purl.obolibrary.org/obo/HP_0004808,100.0,False,False,HP:0004808
3,D00ABW,Opterone,Hormone deficiency,Discontinued in Phase 1,hormone deficiency,,,,,,
4,D00ACC,ND1251,Depression,Discontinued in Phase 1,depression,Depressivity,http://purl.obolibrary.org/obo/HP_0000716,100.0,False,False,HP:0000716
...,...,...,...,...,...,...,...,...,...,...,...
29562,DZU72C,OKI 179,Solid tumour/cancer,Phase 1,solid tumour cancer,,,,,,
29563,DZVF15,TRN-110,Attention deficit hyperactivity disorder,Clinical Trial,attention deficit hyperactivity disorder,Attention deficit hyperactivity disorder,http://purl.obolibrary.org/obo/HP_0007018,100.0,False,False,HP:0007018
29564,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.0,False,False,HP:0030358
29565,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.0,False,False,HP:0030358


In [55]:
final.to_csv(f'output/drug_to_disease_HP_{DISEASE_PREFIX}.csv', encoding = 'utf-8-sig') 

### **Comparison**

In [56]:
final = pd.read_csv(f'output/drug_to_disease_HP_{DISEASE_PREFIX}.csv', header=0)
print(f'There are {len(final)} rows of drug-disease pairs that are attempted to match with disease ID')

There are 29567 rows of drug-disease pairs that are attempted to match with disease ID


In [57]:
final_filtered = final[final['score'] > 80]
print(f'There are {len(final_filtered)} rows of drug-disease pairs that are suifficiently matched with disease ID')

There are 14760 rows of drug-disease pairs that are suifficiently matched with disease ID


In [58]:
final_filtered

Unnamed: 0.1,Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,score,validated,review,ID
1,1,D00ABE,ALD-301,Peripheral arterial disease,Phase 2,peripheral arterial disease,Peripheral arterial stenosis,http://purl.obolibrary.org/obo/HP_0004950,100.00,False,False,HP:0004950
2,2,D00ABO,KW-2449,Acute myeloid leukaemia,Phase 1,acute myeloid leukaemia,Acute myeloid leukemia,http://purl.obolibrary.org/obo/HP_0004808,100.00,False,False,HP:0004808
4,4,D00ACC,ND1251,Depression,Discontinued in Phase 1,depression,Depressivity,http://purl.obolibrary.org/obo/HP_0000716,100.00,False,False,HP:0000716
5,5,D00ACH,HMR-4004,Bacterial infection,Terminated,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,85.71,False,False,HP:0100658
11,11,D00AHT,PRAME antigen-specific cancer immunotherapeutic,Non-small-cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.00,False,False,HP:0030358
...,...,...,...,...,...,...,...,...,...,...,...,...
29560,29560,DZTX12,ASC-J9,Thrombosis,Phase 2,thrombosis,Thrombocytosis,http://purl.obolibrary.org/obo/HP_0001894,83.33,False,False,HP:0001894
29561,29561,DZTX12,ASC-J9,End-stage renal disease,Phase 2,end stage renal disease,Stage 5 chronic kidney disease,http://purl.obolibrary.org/obo/HP_0003774,100.00,False,False,HP:0003774
29563,29563,DZVF15,TRN-110,Attention deficit hyperactivity disorder,Clinical Trial,attention deficit hyperactivity disorder,Attention deficit hyperactivity disorder,http://purl.obolibrary.org/obo/HP_0007018,100.00,False,False,HP:0007018
29564,29564,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.00,False,False,HP:0030358


In [59]:
unique_drugs

['aclarubicin',
 'aldosterone',
 'aminoquinuride',
 'androstenediol',
 'androstenedione',
 'astemizole',
 'bacitracin',
 'beclometasone dipropionate',
 'budesonide',
 'busulfan',
 'candesartan cilexetil',
 'cefonicid',
 'cefotetan',
 'cefotiam',
 'cefuroxime',
 'cefalotin',
 'cefradine',
 'chlorhexidine',
 'chloropyramine',
 'chlorpromazine',
 'clomifene',
 'clotrimazole',
 'cortisone acetate',
 'ciclosporin',
 'dacarbazine',
 'danazol',
 'dasatinib',
 'daunorubicin',
 'prasterone',
 'desogestrel',
 'desoxycortone',
 'dexamethasone',
 'disulfiram',
 'dobutamine',
 'docetaxel',
 'doxorubicin',
 'doxycycline',
 'ebastine',
 'econazole',
 'entacapone',
 'erlotinib',
 'estradiol',
 'estradiol benzoate',
 'estriol succinate',
 'ethinylestradiol',
 'ethisterone',
 'etonogestrel',
 'flopropione',
 'floxacillin',
 'fluphenazine',
 'gefitinib',
 'gestrinone',
 'hexachlorophene',
 'hydrocortisone',
 'imatinib',
 'diiodohydroxyquinoline',
 'lapatinib',
 'levodopa',
 'levonorgestrel',
 'losartan',

In [60]:
column_names = ["DRUG_ID", "DRUG_NAME", "DISEASES", 'PHASE']

df = pd.DataFrame(columns = column_names)

df.head(10)

Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE


In [61]:
i = 0
for index, row in final_filtered.iterrows(): 
  if row['DRUG_NAME'].lower() not in unique_drugs or row['ID'] not in unique_dis_id:
    final_filtered = final_filtered.drop(labels = index, axis = 0)
   
len(final_filtered)

29

In [62]:
for index, row in final_filtered.iterrows(): 
  drug = row['DRUG_NAME'].lower()
  id_value = graph_drugs[graph_drugs['DRUG_NAME'] == drug]['STRUCT_ID'].values[0]
  final_filtered.at[index, 'DRUG_ID'] = id_value
len(final_filtered)

29

In [63]:
final_filtered

Unnamed: 0.1,Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,score,validated,review,ID
878,878,1932,Nilotinib,Chronic myelogenous leukaemia,Approved,chronic myelogenous leukaemia,Chronic myelogenous leukemia,http://purl.obolibrary.org/obo/HP_0005506,100.0,False,False,HP:0005506
1167,1167,1879,Nandrolone,Osteoporosis,Approved,osteoporosis,Osteoporosis,http://purl.obolibrary.org/obo/HP_0000939,100.0,False,False,HP:0000939
1291,1291,542,Cefonicid,Bacterial infection,Approved,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,85.71,False,False,HP:0100658
1945,1945,574,Cefalotin,Bacterial infection,Approved,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,85.71,False,False,HP:0100658
2262,2262,2351,Raloxifene,Osteoporosis,Approved,osteoporosis,Osteoporosis,http://purl.obolibrary.org/obo/HP_0000939,100.0,False,False,HP:0000939
5006,5006,4138,Gentian violet,Bacterial infection,Approved,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,85.71,False,False,HP:0100658
7442,7442,576,Cefradine,Bacterial infection,Approved,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,85.71,False,False,HP:0100658
8028,8028,2607,Testosterone,Osteoporosis,Approved,osteoporosis,Osteoporosis,http://purl.obolibrary.org/obo/HP_0000939,100.0,False,False,HP:0000939
8752,8752,4977,Siltuximab,Anemia,Approved,anemia,Anemia,http://purl.obolibrary.org/obo/HP_0001903,100.0,False,False,HP:0001903
9195,9195,960,Doxorubicin,Tumour,Investigative,tumour,Neoplasm,http://purl.obolibrary.org/obo/HP_0002664,100.0,False,False,HP:0002664


In [64]:
final_filtered = final_filtered.drop(labels = ['Unnamed: 0', 'score'], axis =1)

In [65]:
final_filtered.head(10)

Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,validated,review,ID
878,1932,Nilotinib,Chronic myelogenous leukaemia,Approved,chronic myelogenous leukaemia,Chronic myelogenous leukemia,http://purl.obolibrary.org/obo/HP_0005506,False,False,HP:0005506
1167,1879,Nandrolone,Osteoporosis,Approved,osteoporosis,Osteoporosis,http://purl.obolibrary.org/obo/HP_0000939,False,False,HP:0000939
1291,542,Cefonicid,Bacterial infection,Approved,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,False,False,HP:0100658
1945,574,Cefalotin,Bacterial infection,Approved,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,False,False,HP:0100658
2262,2351,Raloxifene,Osteoporosis,Approved,osteoporosis,Osteoporosis,http://purl.obolibrary.org/obo/HP_0000939,False,False,HP:0000939
5006,4138,Gentian violet,Bacterial infection,Approved,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,False,False,HP:0100658
7442,576,Cefradine,Bacterial infection,Approved,bacterial infection,Cellulitis,http://purl.obolibrary.org/obo/HP_0100658,False,False,HP:0100658
8028,2607,Testosterone,Osteoporosis,Approved,osteoporosis,Osteoporosis,http://purl.obolibrary.org/obo/HP_0000939,False,False,HP:0000939
8752,4977,Siltuximab,Anemia,Approved,anemia,Anemia,http://purl.obolibrary.org/obo/HP_0001903,False,False,HP:0001903
9195,960,Doxorubicin,Tumour,Investigative,tumour,Neoplasm,http://purl.obolibrary.org/obo/HP_0002664,False,False,HP:0002664


In [66]:
final_filtered.to_csv(f'output/matched_drug_to_disease_{DISEASE_PREFIX}.csv', encoding = 'utf-8-sig') 