In [1]:
import pandas as pd
import numpy as np

## Linked Data Processing 
This section processes linked drug–protein data from three sources: the FDA substrate and inhibitor tables, the Drug Interaction Knowledge Base (DIKB), and DrugCentral. The data were preprocessed and combined with identifier mappings before being loaded into this notebook.


### FDA, DIKB, Drug Central

In [2]:
# Load the combined staging table (tab-separated) into a DataFrame.
staging_path = '../resources/processed_data/unprocessed_data/staging_combined_new_202308101935.tsv'
df = pd.read_csv(staging_path, sep='\t')

In [3]:
# Summarize the staging table: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2167 entries, 0 to 2166
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2167 non-null   int64  
 1   chemical_id        2167 non-null   object 
 2   chemical_name      2167 non-null   object 
 3   protein_id         2164 non-null   object 
 4   protein_name       2167 non-null   object 
 5   relation_name      2167 non-null   object 
 6   relation_id        2167 non-null   object 
 7   source             2167 non-null   object 
 8   dikb_id            437 non-null    float64
 9   fda_id             878 non-null    float64
 10  drugcentral_id     852 non-null    float64
 11  reference          2080 non-null   object 
 12  year               2167 non-null   int64  
 13  measurement_type   1160 non-null   object 
 14  measurement_value  1160 non-null   float64
dtypes: float64(4), int64(2), object(9)
memory usage: 254.1+ KB


In [4]:
# Preview the first five rows to sanity-check the column contents.
df.head()

Unnamed: 0,id,chemical_id,chemical_name,protein_id,protein_name,relation_name,relation_id,source,dikb_id,fda_id,drugcentral_id,reference,year,measurement_type,measurement_value
0,2397,CHEBI_28901,busulfan,PR_P11509,CYP2A6,inhibits,RO_0002449,FDA,,74.0,,PL,2017,,
1,2398,CHEBI_8426,probenecid,PR_Q9UNQ0,BCRP,molecularly_interacts_with,RO_0002436,FDA,,869.0,,,2017,,
2,359,CHEBI_7936,paroxetine,PR_P10635,cyp2d6,inhibits,RO_0002449,dikb,607.0,,,http://www.ncbi.nlm.nih.gov/pubmed/12173784,2017,,
3,2399,CHEBI_119573,delavirdine,PR_P08684,CYP3A4,inhibits,RO_0002449,FDA,,13.0,,PL,2017,,
4,438,CHEBI_135737,lacidipine,PR_P33261,Cytochrome P450 2C19,is_substrate_of,DIDEO_00000041,drug_central,,,356.0,DRUG MATRIX,2017,IC50,5.0


In [5]:
# Keep only rows that have both a chemical and a protein identifier;
# a row is dropped when either of the two is missing.
required_ids = ['chemical_id', 'protein_id']
df = df.dropna(axis='index', how='any', subset=required_ids)

In [6]:
# Re-check row and non-null counts after removing rows with a missing identifier.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2164 entries, 0 to 2166
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2164 non-null   int64  
 1   chemical_id        2164 non-null   object 
 2   chemical_name      2164 non-null   object 
 3   protein_id         2164 non-null   object 
 4   protein_name       2164 non-null   object 
 5   relation_name      2164 non-null   object 
 6   relation_id        2164 non-null   object 
 7   source             2164 non-null   object 
 8   dikb_id            437 non-null    float64
 9   fda_id             875 non-null    float64
 10  drugcentral_id     852 non-null    float64
 11  reference          2077 non-null   object 
 12  year               2164 non-null   int64  
 13  measurement_type   1159 non-null   object 
 14  measurement_value  1159 non-null   float64
dtypes: float64(4), int64(2), object(9)
memory usage: 270.5+ KB


In [7]:
# Select the rows whose relation is RO_0002020
# (presumably the "transports" relation, given the variable name — confirm).
is_transport = df['relation_id'] == 'RO_0002020'
transport = df[is_transport]

In [8]:
# Build one frame per remaining relation type, keyed on the relation identifier.
relation_ids = df['relation_id']
inhibitor = df[relation_ids == 'RO_0002449']       # inhibits
molecule = df[relation_ids == 'RO_0002436']        # molecularly_interacts_with
substrate = df[relation_ids == 'DIDEO_00000041']   # is_substrate_of

In [9]:
# List the available columns before narrowing each subset to its identifier pair.
df.columns

Index(['id', 'chemical_id', 'chemical_name', 'protein_id', 'protein_name',
       'relation_name', 'relation_id', 'source', 'dikb_id', 'fda_id',
       'drugcentral_id', 'reference', 'year', 'measurement_type',
       'measurement_value'],
      dtype='object')

In [10]:
# Reduce the transporter subset to the two identifier columns that form an edge.
edge_columns = ['chemical_id', 'protein_id']
transport = transport.loc[:, edge_columns]
transport.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122 entries, 862 to 1172
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   chemical_id  122 non-null    object
 1   protein_id   122 non-null    object
dtypes: object(2)
memory usage: 2.9+ KB


In [11]:
# Reduce the other three subsets to the same (chemical_id, protein_id) pair.
edge_columns = ['chemical_id', 'protein_id']
inhibitor, molecule, substrate = (
    frame.loc[:, edge_columns] for frame in (inhibitor, molecule, substrate)
)

In [12]:
# Renumber the rows from 0 after filtering; drop=True discards the old row
# labels instead of keeping them as a column.
transport = transport.reset_index(drop=True)
transport.head()

Unnamed: 0,chemical_id,protein_id
0,CHEBI_38545,PR_Q9NPD5
1,CHEBI_8805,PR_P08183
2,CHEBI_5050,PR_O15245
3,CHEBI_70735,PR_P08183
4,CHEBI_82960,PR_P08183


In [13]:
# Renumber the rows of the other three subsets from 0, discarding the old labels.
inhibitor, molecule, substrate = (
    frame.reset_index(drop=True) for frame in (inhibitor, molecule, substrate)
)

In [14]:
# Remove repeated (chemical_id, protein_id) pairs; the first occurrence is kept.
transport, inhibitor, substrate, molecule = (
    frame.drop_duplicates()
    for frame in (transport, inhibitor, substrate, molecule)
)

In [15]:
# Output directory for the generated edge files. The trailing slash matters:
# file names are appended to this string by plain concatenation.
processed_data_loc = '../resources/processed_data/'

In [16]:
# Write each relation's (chemical_id, protein_id) pairs as a headerless,
# tab-separated, two-column file in the processed-data directory.
edge_files = {
    'CHEMICAL_TRANSPORTER.tsv': transport,
    'CHEMICAL_INHIBITOR.tsv': inhibitor,
    'CHEMICAL_MOLECULE.tsv': molecule,
    'CHEMICAL_SUBSTRATE.tsv': substrate,
}
for file_name, edges in edge_files.items():
    # header=False is the documented way to omit the column-name row;
    # header=None only had that effect because None is falsy.
    edges.to_csv(processed_data_loc + file_name, header=False, sep='\t', index=False)

### repoDB

In [17]:
## Read the mapped repoDB table (tab-separated) and preview it
repodb_path = '../resources/processed_data/unprocessed_data/repodb_approved_mapped_20230804.tsv'
repodb = pd.read_csv(repodb_path, sep='\t')
repodb.head()

Unnamed: 0,drug_name,drug_id,ind_name,ind_id,nct,status,phase,detailedstatus,drug_obo_id,ind_obo_id
0,Lepirudin,DB00001,Heparin-induced thrombocytopenia with thrombosis,C0272275,,Approved,,,CHEBI:142437,
1,Cetuximab,DB00002,Squamous cell carcinoma of mouth,C0585362,,Approved,,,,MONDO:0004958
2,Cetuximab,DB00002,Squamous cell carcinoma of nose,C3163899,,Approved,,,,
3,Cetuximab,DB00002,Squamous cell carcinoma of pharynx,C1319317,,Approved,,,,MONDO:0000536
4,Cetuximab,DB00002,Laryngeal Squamous Cell Carcinoma,C0280324,,Approved,,,,MONDO:0005595


In [18]:
# Summarize the repoDB table: row count, per-column non-null counts and dtypes.
repodb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6677 entries, 0 to 6676
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   drug_name       6677 non-null   object 
 1   drug_id         6677 non-null   object 
 2   ind_name        6677 non-null   object 
 3   ind_id          6677 non-null   object 
 4   nct             0 non-null      float64
 5   status          6677 non-null   object 
 6   phase           0 non-null      float64
 7   detailedstatus  0 non-null      float64
 8   drug_obo_id     6299 non-null   object 
 9   ind_obo_id      4296 non-null   object 
dtypes: float64(3), object(7)
memory usage: 521.8+ KB


In [20]:
## Drop rows where the drug OBO id or the indication OBO id is NA
## (a row is kept only when both ids are present)
obo_id_columns = ['drug_obo_id', 'ind_obo_id']
repodb = repodb.dropna(axis='index', how='any', subset=obo_id_columns)
repodb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4017 entries, 7 to 6676
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   drug_name       4017 non-null   object 
 1   drug_id         4017 non-null   object 
 2   ind_name        4017 non-null   object 
 3   ind_id          4017 non-null   object 
 4   nct             0 non-null      float64
 5   status          4017 non-null   object 
 6   phase           0 non-null      float64
 7   detailedstatus  0 non-null      float64
 8   drug_obo_id     4017 non-null   object 
 9   ind_obo_id      4017 non-null   object 
dtypes: float64(3), object(7)
memory usage: 345.2+ KB


In [21]:
# Keep only the drug/indication id pair and renumber the rows from 0.
repodb = repodb.loc[:, ['drug_obo_id', 'ind_obo_id']].reset_index(drop=True)
repodb.head()

Unnamed: 0,drug_obo_id,ind_obo_id
0,CHEBI:4875,MONDO:0008383
1,CHEBI:4875,MONDO:0011849
2,CHEBI:6427,MONDO:0000088
3,CHEBI:9118,MONDO:0006909
4,CHEBI:3306,MONDO:0008159


In [22]:
## relationship = treats
def process(text: str) -> str:
    """Return ``text`` with every ':' replaced by '_'.

    Example: 'CHEBI:4875' becomes 'CHEBI_4875'.
    """
    return '_'.join(text.split(':'))

In [23]:
# Rewrite the drug ids from 'PREFIX:ID' to 'PREFIX_ID' form.
drug_ids = repodb['drug_obo_id']
repodb['drug_obo_id'] = drug_ids.map(process)
repodb.head()

Unnamed: 0,drug_obo_id,ind_obo_id
0,CHEBI_4875,MONDO:0008383
1,CHEBI_4875,MONDO:0011849
2,CHEBI_6427,MONDO:0000088
3,CHEBI_9118,MONDO:0006909
4,CHEBI_3306,MONDO:0008159


In [24]:
# Rewrite the indication ids from 'PREFIX:ID' to 'PREFIX_ID' form.
indication_ids = repodb['ind_obo_id']
repodb['ind_obo_id'] = indication_ids.map(process)


In [25]:
# Remove repeated drug/indication pairs, keeping the first occurrence of each.
is_repeat = repodb.duplicated()
repodb = repodb[~is_repeat]
repodb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3996 entries, 0 to 4016
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   drug_obo_id  3996 non-null   object
 1   ind_obo_id   3996 non-null   object
dtypes: object(2)
memory usage: 93.7+ KB


In [None]:
# Write the drug/indication pairs as a tab-separated, two-column file.
# header=False keeps this file consistent with the other CHEMICAL_*.tsv files
# written by this notebook, none of which has a column-name row.
# NOTE(review): confirm that no downstream reader expects a header in this file.
repodb.to_csv(processed_data_loc + 'CHEMICAL_INDICATION.tsv', header=False, sep='\t', index=False)

### MEDLINE

In [None]:
# TODO: add MEDLINE data processing

### SPLICER

In [None]:
#TO DO