In [1]:
import pandas as pd
import numpy as np

## Linked Data Processing 
This notebook processes linked data from FDA substrates and inhibitors, the Drug Interaction Knowledge Base (DIKB), Drug Central, repoDB and ONSIDES. The data were preprocessed and combined with mappings prior to this notebook.


### FDA, DIKB, Drug Central

In [2]:
# Load the combined FDA / DIKB / Drug Central staging table (tab-separated).
df = pd.read_csv(
    '../resources/processed_data/unprocessed_data/staging_combined_new_202308101935.tsv',
    sep='\t',
)

In [3]:
# Show column dtypes and non-null counts of the freshly loaded staging table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2167 entries, 0 to 2166
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2167 non-null   int64  
 1   chemical_id        2167 non-null   object 
 2   chemical_name      2167 non-null   object 
 3   protein_id         2164 non-null   object 
 4   protein_name       2167 non-null   object 
 5   relation_name      2167 non-null   object 
 6   relation_id        2167 non-null   object 
 7   source             2167 non-null   object 
 8   dikb_id            437 non-null    float64
 9   fda_id             878 non-null    float64
 10  drugcentral_id     852 non-null    float64
 11  reference          2080 non-null   object 
 12  year               2167 non-null   int64  
 13  measurement_type   1160 non-null   object 
 14  measurement_value  1160 non-null   float64
dtypes: float64(4), int64(2), object(9)
memory usage: 254.1+ KB


In [4]:
# Preview the first rows of the staging table.
df.head()

Unnamed: 0,id,chemical_id,chemical_name,protein_id,protein_name,relation_name,relation_id,source,dikb_id,fda_id,drugcentral_id,reference,year,measurement_type,measurement_value
0,2397,CHEBI_28901,busulfan,PR_P11509,CYP2A6,inhibits,RO_0002449,FDA,,74.0,,PL,2017,,
1,2398,CHEBI_8426,probenecid,PR_Q9UNQ0,BCRP,molecularly_interacts_with,RO_0002436,FDA,,869.0,,,2017,,
2,359,CHEBI_7936,paroxetine,PR_P10635,cyp2d6,inhibits,RO_0002449,dikb,607.0,,,http://www.ncbi.nlm.nih.gov/pubmed/12173784,2017,,
3,2399,CHEBI_119573,delavirdine,PR_P08684,CYP3A4,inhibits,RO_0002449,FDA,,13.0,,PL,2017,,
4,438,CHEBI_135737,lacidipine,PR_P33261,Cytochrome P450 2C19,is_substrate_of,DIDEO_00000041,drug_central,,,356.0,DRUG MATRIX,2017,IC50,5.0


In [5]:
# Discard rows missing either identifier; the exported pairs below consist of
# exactly these two columns.
df = df.dropna(axis=0, how='any', subset=['chemical_id', 'protein_id'])

In [6]:
# Re-check the non-null counts after dropping rows without identifiers.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2164 entries, 0 to 2166
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2164 non-null   int64  
 1   chemical_id        2164 non-null   object 
 2   chemical_name      2164 non-null   object 
 3   protein_id         2164 non-null   object 
 4   protein_name       2164 non-null   object 
 5   relation_name      2164 non-null   object 
 6   relation_id        2164 non-null   object 
 7   source             2164 non-null   object 
 8   dikb_id            437 non-null    float64
 9   fda_id             875 non-null    float64
 10  drugcentral_id     852 non-null    float64
 11  reference          2077 non-null   object 
 12  year               2164 non-null   int64  
 13  measurement_type   1159 non-null   object 
 14  measurement_value  1159 non-null   float64
dtypes: float64(4), int64(2), object(9)
memory usage: 270.5+ KB


In [7]:
# Rows whose relation id is RO_0002020 (presumably the "transports" relation --
# TODO confirm the label against the relation_name column).
transport = df[df['relation_id'].eq('RO_0002020')]

In [8]:
# One frame per remaining relation id. Per the relation_name column in the
# preview above: RO_0002449 = inhibits, RO_0002436 = molecularly_interacts_with,
# DIDEO_00000041 = is_substrate_of.
inhibitor, molecule, substrate = (
    df.loc[df['relation_id'] == relation_id]
    for relation_id in ('RO_0002449', 'RO_0002436', 'DIDEO_00000041')
)

In [9]:
# List the available column names before selecting the identifier pair.
df.columns

Index(['id', 'chemical_id', 'chemical_name', 'protein_id', 'protein_name',
       'relation_name', 'relation_id', 'source', 'dikb_id', 'fda_id',
       'drugcentral_id', 'reference', 'year', 'measurement_type',
       'measurement_value'],
      dtype='object')

In [10]:
# Reduce the transporter rows to the (chemical, protein) identifier pair.
transport = transport.loc[:, ['chemical_id', 'protein_id']]
transport.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122 entries, 862 to 1172
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   chemical_id  122 non-null    object
 1   protein_id   122 non-null    object
dtypes: object(2)
memory usage: 2.9+ KB


In [11]:
# Same reduction to the (chemical, protein) identifier pair for the other
# three relation frames.
inhibitor, molecule, substrate = (
    frame[['chemical_id', 'protein_id']]
    for frame in (inhibitor, molecule, substrate)
)

In [12]:
# Renumber the rows from 0 (the filtered frame still carries the original
# row labels), then preview the result.
transport = transport.reset_index(drop=True)
transport.head()

Unnamed: 0,chemical_id,protein_id
0,CHEBI_38545,PR_Q9NPD5
1,CHEBI_8805,PR_P08183
2,CHEBI_5050,PR_O15245
3,CHEBI_70735,PR_P08183
4,CHEBI_82960,PR_P08183


In [13]:
# Renumber the rows of the other three relation frames from 0 as well.
inhibitor, molecule, substrate = (
    frame.reset_index(drop=True)
    for frame in (inhibitor, molecule, substrate)
)

In [14]:
# Each (chemical, protein) pair should appear only once per relation file.
transport, inhibitor, substrate, molecule = (
    frame.drop_duplicates()
    for frame in (transport, inhibitor, substrate, molecule)
)

In [12]:
# Output directory prefix for every exported TSV in this notebook; it is
# concatenated directly with the file name, so the trailing '/' is required.
processed_data_loc = '../resources/processed_data/'

In [16]:
# Write each relation frame as a headerless, index-free, tab-separated file.
for frame, filename in (
    (transport, 'CHEMICAL_TRANSPORTER.tsv'),
    (inhibitor, 'CHEMICAL_INHIBITOR.tsv'),
    (molecule, 'CHEMICAL_MOLECULE.tsv'),
    (substrate, 'CHEMICAL_SUBSTRATE.tsv'),
):
    frame.to_csv(processed_data_loc + filename, header=None, sep='\t', index=False)

### repoDB

In [17]:
## read in the mapped repoDB table (tab-separated) and preview it
repodb = pd.read_csv(
    '../resources/processed_data/unprocessed_data/reposdb_mapped_202402191155.tsv',
    sep='\t',
)
repodb.head()

Unnamed: 0,drug_name,drugbank_id,ind_name,ind_id,nct,status,phase,detailedstatus,drug_obo_id,drug_rxnorm_id,ind_obo_id,ind_meddra_pt,ind_snomed_id
0,ephedrine,DB01364,Influenza-like symptoms,C0392171,,Approved,,,CHEBI_15407,3966,,,
1,enoxacin,DB00467,Acute gonococcal cervicitis,C0153195,,Approved,,,CHEBI_157175,3925,MONDO_0001080,,20943002.0
2,alatrofloxacin,DB09335,Haemophilus influenzae pneumonia,C0276026,,Approved,,,CHEBI_135829,141440,,,70036007.0
3,alatrofloxacin,DB09335,Acute gonococcal cervicitis,C0153195,,Approved,,,CHEBI_135829,141440,MONDO_0001080,,20943002.0
4,alatrofloxacin,DB09335,Bacterial infection due to Klebsiella pneumoniae,C0343402,,Approved,,,CHEBI_135829,141440,,,186435004.0


In [18]:
# Show column dtypes and non-null counts of the raw repoDB table.
repodb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13558 entries, 0 to 13557
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   drug_name       11623 non-null  object 
 1   drugbank_id     13558 non-null  object 
 2   ind_name        13558 non-null  object 
 3   ind_id          13558 non-null  object 
 4   nct             4627 non-null   object 
 5   status          13558 non-null  object 
 6   phase           4627 non-null   object 
 7   detailedstatus  3872 non-null   object 
 8   drug_obo_id     12342 non-null  object 
 9   drug_rxnorm_id  12933 non-null  object 
 10  ind_obo_id      9186 non-null   object 
 11  ind_meddra_pt   10548 non-null  float64
 12  ind_snomed_id   13104 non-null  float64
dtypes: float64(2), object(11)
memory usage: 1.3+ MB


In [19]:
## keep only rows where both the drug OBO id and the indication OBO id are present
repodb = repodb.dropna(axis=0, how='any', subset=['drug_obo_id', 'ind_obo_id'])
repodb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8404 entries, 1 to 13552
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   drug_name       7165 non-null   object 
 1   drugbank_id     8404 non-null   object 
 2   ind_name        8404 non-null   object 
 3   ind_id          8404 non-null   object 
 4   nct             2980 non-null   object 
 5   status          8404 non-null   object 
 6   phase           2980 non-null   object 
 7   detailedstatus  2407 non-null   object 
 8   drug_obo_id     8404 non-null   object 
 9   drug_rxnorm_id  7981 non-null   object 
 10  ind_obo_id      8404 non-null   object 
 11  ind_meddra_pt   7431 non-null   float64
 12  ind_snomed_id   8281 non-null   float64
dtypes: float64(2), object(11)
memory usage: 919.2+ KB


In [20]:
# Keep just the drug / indication OBO id pair and renumber the rows from 0.
repodb = repodb[['drug_obo_id', 'ind_obo_id']].reset_index(drop=True)
repodb.head()

Unnamed: 0,drug_obo_id,ind_obo_id
0,CHEBI_157175,MONDO_0001080
1,CHEBI_135829,MONDO_0001080
2,CHEBI_15407,MONDO_0005709
3,CHEBI_113451,MONDO_0009692
4,CHEBI_29073,MONDO_0009692


In [21]:
##relationship = treats
def process(text):
    """Return ``text`` with every ':' replaced by '_'.

    Strings without a colon are returned unchanged.
    """
    return text.replace(':', '_')

In [22]:
# Normalise the drug ids (':' -> '_') element by element with process().
repodb['drug_obo_id'] = repodb['drug_obo_id'].map(process)
repodb.head()

Unnamed: 0,drug_obo_id,ind_obo_id
0,CHEBI_157175,MONDO_0001080
1,CHEBI_135829,MONDO_0001080
2,CHEBI_15407,MONDO_0005709
3,CHEBI_113451,MONDO_0009692
4,CHEBI_29073,MONDO_0009692


In [23]:
# Normalise the indication ids (':' -> '_') element by element with process().
repodb['ind_obo_id'] = repodb['ind_obo_id'].map(process)

In [24]:
# Remove repeated drug / indication pairs, keeping the first occurrence.
repodb = repodb.drop_duplicates(keep='first')
repodb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6505 entries, 0 to 8403
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   drug_obo_id  6505 non-null   object
 1   ind_obo_id   6505 non-null   object
dtypes: object(2)
memory usage: 152.5+ KB


In [25]:
# Export the drug / indication pairs as a tab-separated file without the index.
# NOTE(review): unlike the CHEMICAL_TRANSPORTER/INHIBITOR/MOLECULE/SUBSTRATE
# exports, no header=None is passed here, so a header row is written --
# confirm that downstream consumers expect it.
repodb.to_csv(f'{processed_data_loc}CHEMICAL_INDICATION.tsv', sep='\t', index=False)

### ONSIDES

In [2]:
## read the ONSIDES adverse-reaction table with mappings (tab-separated) and preview it
onsides = pd.read_csv(
    '../resources/processed_data/unprocessed_data/onsides_adverse_reactions_with_mappings_20240530.tsv',
    sep='\t',
)
onsides.head()

Unnamed: 0,ingredients_rxcuis,ingredients_names,num_ingredients,pt_meddra_id,pt_meddra_term,percent_labels,num_labels,ingredient_obo_uri,ingredient_obo_label,rxnorm_mapping_category,...,pt_snomed_map_id,pt_snomed_map_name,pt_obo_map_uri,pt_obo_map_label,pt_mapping_category,pt_mapping_evidence,pt_obo_uri,pt_obo_label,pt_mapping_source,pt_umls_cui
0,10689,tramadol,1,10000060,Abdominal distension,0.010753,186,CHEBI_9648,tramadol,Automatic One-to-Many Concept,...,,,,,,,HP_0003270,nan|Abdominal distention,umls_cross_ref,C0000731
1,358258,bortezomib,1,10000060,Abdominal distension,0.909091,22,CHEBI_52717,bortezomib,Automatic One-to-One Concept,...,,,,,,,HP_0003270,nan|Abdominal distention,umls_cross_ref,C0000731
2,10473,thiotepa,1,10051017,Staphylococcal bacteraemia,0.444444,9,CHEBI_9570,thiotepa,Automatic One-to-One Concept,...,111821004.0,,,,adr_mapping_cross_ref,,,,,C0152965
3,2564025,belumosudil,1,10051017,Staphylococcal bacteraemia,2.0,1,,,NER,...,111821004.0,,,,adr_mapping_cross_ref,,,,,C0152965
4,10390,tetrabenazine,1,10058818,Skin laceration,0.333333,12,CHEBI_9467,tetrabenazine,Automatic One-to-One Concept,...,83535005.0,,,,adr_mapping_cross_ref,,,,,C0558401


In [4]:
# Show column dtypes and non-null counts of the raw ONSIDES table.
onsides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99239 entries, 0 to 99238
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ingredients_rxcuis       99239 non-null  int64  
 1   ingredients_names        99239 non-null  object 
 2   num_ingredients          99239 non-null  int64  
 3   pt_meddra_id             99239 non-null  int64  
 4   pt_meddra_term           99239 non-null  object 
 5   percent_labels           99239 non-null  float64
 6   num_labels               99239 non-null  int64  
 7   ingredient_obo_uri       89672 non-null  object 
 8   ingredient_obo_label     89672 non-null  object 
 9   rxnorm_mapping_category  99239 non-null  object 
 10  rxnorm_mapping_evidence  87235 non-null  object 
 11  pt_snomed_map_id         271 non-null    float64
 12  pt_snomed_map_name       0 non-null      float64
 13  pt_obo_map_uri           133 non-null    object 
 14  pt_obo_map_label      

In [5]:
## keep only rows where both the ingredient OBO id and the adverse-reaction term
## OBO id are present -- this is the actual number of rows from ONSIDES
onsides = onsides.dropna(axis=0, how='any', subset=['ingredient_obo_uri', 'pt_obo_uri'])
onsides.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75483 entries, 0 to 99220
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ingredients_rxcuis       75483 non-null  int64  
 1   ingredients_names        75483 non-null  object 
 2   num_ingredients          75483 non-null  int64  
 3   pt_meddra_id             75483 non-null  int64  
 4   pt_meddra_term           75483 non-null  object 
 5   percent_labels           75483 non-null  float64
 6   num_labels               75483 non-null  int64  
 7   ingredient_obo_uri       75483 non-null  object 
 8   ingredient_obo_label     75483 non-null  object 
 9   rxnorm_mapping_category  75483 non-null  object 
 10  rxnorm_mapping_evidence  73535 non-null  object 
 11  pt_snomed_map_id         134 non-null    float64
 12  pt_snomed_map_name       0 non-null      float64
 13  pt_obo_map_uri           92 non-null     object 
 14  pt_obo_map_label      

In [6]:
# Keep just the ingredient / term OBO id pair and renumber the rows from 0.
onsides = onsides[['ingredient_obo_uri', 'pt_obo_uri']].reset_index(drop=True)
onsides.head()

Unnamed: 0,ingredient_obo_uri,pt_obo_uri
0,CHEBI_9648,HP_0003270
1,CHEBI_52717,HP_0003270
2,CHEBI_63638,HP_0003270
3,CHEBI_75283,MONDO_0001404
4,CHEBI_45783,HP_0003270


In [7]:
## Same ':' -> '_' helper as defined in the repoDB section, repeated here so this
## section also runs on its own. The rows in this section come from the ONSIDES
## adverse-reaction file, not from "treats" relations.
def process(text):
    """Return ``text`` with every ':' replaced by '_'.

    Strings without a colon are returned unchanged.
    """
    return text.replace(':', '_')

In [8]:
# Normalise both id columns (':' -> '_') element by element with process().
for id_column in ('ingredient_obo_uri', 'pt_obo_uri'):
    onsides[id_column] = onsides[id_column].map(process)
onsides.head()

Unnamed: 0,ingredient_obo_uri,pt_obo_uri
0,CHEBI_9648,HP_0003270
1,CHEBI_52717,HP_0003270
2,CHEBI_63638,HP_0003270
3,CHEBI_75283,MONDO_0001404
4,CHEBI_45783,HP_0003270


In [9]:
# A pt_obo_uri value may hold several '|'-separated term ids. Keep the first id
# on the original row and add one extra row per remaining id. The extra rows go
# after all original rows, ordered by source row and then by position in the
# value, which is the same layout as appending them one at a time.
#
# The extra rows are collected first and concatenated once: calling pd.concat
# inside the loop copies the whole frame for every added row, which is
# quadratic in the number of rows. Iterating the columns directly also removes
# the reliance on a 0..N-1 index for label-based .at lookups.
pt_uri_parts = [pt_uri.split('|') for pt_uri in onsides['pt_obo_uri']]
extra_rows = [
    {'ingredient_obo_uri': ing_uri, 'pt_obo_uri': pt}
    for ing_uri, parts in zip(onsides['ingredient_obo_uri'], pt_uri_parts)
    for pt in parts[1:]
]
onsides = pd.concat(
    [
        # original rows, now carrying only the first id of each value
        onsides.assign(pt_obo_uri=[parts[0] for parts in pt_uri_parts]),
        # explicit columns keep the schema stable when there are no extra rows
        pd.DataFrame(extra_rows, columns=['ingredient_obo_uri', 'pt_obo_uri']),
    ],
    ignore_index=True,
)
onsides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105175 entries, 0 to 105174
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   ingredient_obo_uri  105175 non-null  object
 1   pt_obo_uri          105175 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB


In [10]:
# Remove repeated ingredient / term pairs, keeping the first occurrence.
onsides = onsides.drop_duplicates(keep='first')
onsides.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97720 entries, 0 to 105174
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ingredient_obo_uri  97720 non-null  object
 1   pt_obo_uri          97720 non-null  object
dtypes: object(2)
memory usage: 2.2+ MB


In [13]:
# Export the ingredient / adverse-reaction term pairs as a tab-separated file
# without the index. Relies on processed_data_loc, which is defined in the
# FDA / DIKB / Drug Central section of this notebook.
# NOTE(review): no header=None is passed here, so a header row is written,
# unlike the CHEMICAL_TRANSPORTER/INHIBITOR/MOLECULE/SUBSTRATE exports --
# confirm that downstream consumers expect it.
onsides.to_csv(f'{processed_data_loc}CHEMICAL_ADR.tsv', sep='\t', index=False)