# DrugBank Processing

In [None]:
class Drug:
    """
    Drug-level record built from a dictionary of extracted features.

    The constructor copies eight values out of ``features`` onto the instance
    and starts with an empty ``target`` list that ``addTarget`` appends to.
    """

    # One entry per stored field:
    # (instance attribute, key read from ``features``, key used on export)
    _FIELDS = (
        ("id", "id", "dg_id"),
        ("name", "name", "dg_name"),
        ("synonyms", "synm", "dg_synm"),
        ("kingdom", "kgd", "dg_kingdom"),
        ("superclass", "sclass", "dg_superclass"),
        ("interaction", "itrc", "dg_interactions"),
        ("external_id", "ext_id", "dg_ext_id"),
        ("pathways", "pathways", "dg_pathways"),
    )

    def __init__(self, features):
        """
        Store the values of ``features`` as attributes.

        ``features`` must provide the keys 'id', 'name', 'synm', 'kgd',
        'sclass', 'itrc', 'ext_id' and 'pathways'; a missing key raises
        ``KeyError``.
        """
        for attribute, source_key, _ in self._FIELDS:
            setattr(self, attribute, features[source_key])
        self.target = []

    def getDrugfeatures(self):
        """Return the stored fields as a new dict keyed with the ``dg_`` prefix."""
        return {
            export_key: getattr(self, attribute)
            for attribute, _, export_key in self._FIELDS
        }

    def addTarget(self, feature_target):
        """Append ``feature_target`` to this drug's ``target`` list."""
        self.target.append(feature_target)

# Parameters and required variables #

# Path of the DrugBank XML file that the extraction cell parses.
# NOTE(review): absolute, machine-specific path - it has to be edited before
# the notebook can run on another machine.
dB_file = 'C:/Users/Saathvik/Documents/MATLAB/Project_Liver/data/DrugBank/DrugBank_full_db.xml'
# Only targets whose organism text equals this string are exported.
organism = 'Humans'
# Destination CSV for the extracted drug/target table (same caveat as dB_file).
saveFile = 'C:/Users/Saathvik/Documents/MATLAB/Project_Liver/data/DrugBank/drugBank_v515_targetExtracted.csv'


In [None]:
import xml.etree.ElementTree as ET
import time
from tqdm import tqdm
import pandas as pd

# Extract one row per (drug, target) pair from the DrugBank XML file and write
# the table to `saveFile`.
#
# Every field is looked up afresh for each drug and each target, so an entry
# that lacks a field gets None instead of silently inheriting the value of
# the previously processed entry.


def _local_name(element):
    """Return the tag of ``element`` with any ``{namespace}`` prefix removed."""
    return element.tag.rsplit('}', 1)[-1]


def _children_by_name(element):
    """Map each direct child's namespace-free tag name to the child element.

    When several children share a tag, the last one is kept.
    """
    return {_local_name(child): child for child in element}


def _text_of(element):
    """Return ``element.text``, or None when ``element`` itself is None."""
    return None if element is None else element.text


def _join_texts(elements):
    """Join the text of ``elements`` with ';' (missing text counts as '')."""
    return ';'.join(element.text or '' for element in elements)


xtree = ET.parse(dB_file)
xroot = xtree.getroot()
drugs = list(xroot)

drug_targets = []
for drug_element in tqdm(drugs):
    idDB = drug_element[0].text  # Drug Bank ID (first child of the entry)
    drug_fields = _children_by_name(drug_element)

    # drug name
    drug_name = _text_of(drug_fields.get('name'))

    # drug's synonyms
    synonyms = drug_fields.get('synonyms')
    drug_synm = None if synonyms is None else _join_texts(synonyms)

    # type of drug: children 2 and 3 of <classification> are read as
    # kingdom and superclass; an absent or shorter element yields None
    classification = list(drug_fields.get('classification', ()))
    drug_class_kingdom = (classification[2].text
                          if len(classification) > 2 else None)
    drug_class_superclass = (classification[3].text
                             if len(classification) > 3 else None)

    # interactions with other drugs (first child of each interaction)
    interactions = drug_fields.get('drug-interactions')
    drug_interaction = (None if interactions is None
                        else _join_texts(di[0] for di in interactions))

    # other drug IDs, formatted as "<first child>:<second child>"
    external_ids = drug_fields.get('external-identifiers')
    drug_external_id = (None if external_ids is None else ';'.join(
        (ext_id[0].text or '') + ":" + (ext_id[1].text or '')
        for ext_id in external_ids))

    # related pathways (second child of each pathway)
    pathways = drug_fields.get('pathways')
    drug_pathway = (None if pathways is None
                    else _join_texts(pathway[1] for pathway in pathways))

    # drug's targets; an entry without <targets> contributes no rows
    targets = list(drug_fields.get('targets', ()))

    # get all drug-related information in a Drug object
    drug = Drug({"id": idDB,
                 "name": drug_name,
                 "synm": drug_synm,
                 "kgd": drug_class_kingdom,
                 "sclass": drug_class_superclass,
                 "itrc": drug_interaction,
                 "ext_id": drug_external_id,
                 "pathways": drug_pathway})

    for target in targets:
        target_fields = _children_by_name(target)

        # keep only targets that belong to the requested organism
        if _text_of(target_fields.get('organism')) != organism:
            continue

        target_name = _text_of(target_fields.get('name'))

        actions_element = target_fields.get('actions')
        actions = '' if actions_element is None else _join_texts(actions_element)

        # polypeptide details stay None unless this very target provides them
        gene_name = None
        cell_loc = None
        uniprot = None
        polypeptide = target_fields.get('polypeptide')
        if polypeptide is not None:
            polypeptide_fields = _children_by_name(polypeptide)
            gene_name = _text_of(polypeptide_fields.get('gene-name'))
            cell_loc = _text_of(polypeptide_fields.get('cellular-location'))
            for ext_id in polypeptide_fields.get('external-identifiers', ()):
                if ext_id[0].text == "UniProtKB":
                    uniprot = ext_id[1].text

        # drug columns (dg_*) followed by the target columns
        drug_targets.append({
            **drug.getDrugfeatures(),
            "target_name": target_name,
            "target_uniprot": uniprot,
            "target_gene_name": gene_name,
            "action": actions,
            "cell_loc": cell_loc,
        })


dt = pd.DataFrame(drug_targets)
# The row index is written as the first, unnamed CSV column.
dt.to_csv(saveFile)

100%|██████████| 17430/17430 [00:15<00:00, 1092.57it/s]


Filtering DrugBank drugs to only include drugs that act as inhibitors and whose targets are present in the human Liver GSMM

In [None]:
import pandas as pd

# Inputs: the extracted DrugBank drug/target table, the GSMM gene
# descriptions, and the mammalian metabolic gene sheet
df1 = pd.read_csv('C:/Users/Saathvik/Documents/MATLAB/Project_Liver/data/DrugBank/drugBank_v515_targetExtracted.csv')
df2 = pd.read_csv('C:/Users/Saathvik/Documents/MATLAB/Project_Liver/data/GSMMs/gene_descriptions.csv')
df3 = pd.read_excel('C:/Users/Saathvik/Documents/MATLAB/Project_Liver/data/GSMMs/Mammalian_Metabolic_Final.xlsx')

# De-duplicated gene descriptions
unique_gene_descs = df2['genes_desc'].unique()

# Row-wise conditions; a row is kept only when all three hold.
# 1) the action string is exactly 'inhibitor' (any other string, including a
#    ';'-joined combination of actions, does not match)
is_inhibitor = df1['action'].eq('inhibitor')
# 2) the target gene name appears among the gene descriptions (case-sensitive)
in_gene_descs = df1['target_gene_name'].isin(unique_gene_descs)
# 3) the target gene name appears in the 'Gene Symbol' column (case-insensitive)
metabolic_symbols = df3['Gene Symbol'].str.lower()
in_metabolic_sheet = df1['target_gene_name'].str.lower().isin(metabolic_symbols)

filtered_data_meta = df1[is_inhibitor & in_gene_descs & in_metabolic_sheet]

# Save the filtered table. An earlier variant without condition 3 used to be
# written to drugbank_inhibitors_gsmm.csv; it is no longer produced.
filtered_data_meta.to_csv('C:/Users/Saathvik/Documents/MATLAB/Project_Liver/data/DrugBank/drugbank_inhibitors_gsmm_meta.csv', index=False)



# SIDER2 Processing

Converting PubChem CIDs to DrugBank IDs for SIDER2

In [None]:
import pandas as pd
import requests
# Build `pubchem_to_dg_id_dict`: str(PubChem CID) -> DrugBank ID (or None),
# by querying mychem.info once per unique CID in the SIDER2 table.
#
# The fallback locations are consulted for the record that was just
# downloaded, and only for a successful (HTTP 200) response. A failed
# request always maps the CID to None and never reuses data from a
# previously processed compound.

# Key paths inside a mychem.info record that may hold a DrugBank ID, in the
# order in which they are tried.
_DRUGBANK_ID_PATHS = (
    ('drugbank', 'id'),
    ('chebi', 'xrefs', 'drugbank'),
    ('unichem', 'drugbank'),
    ('pharmgkb', 'xrefs', 'drugbank'),
)


def _find_drugbank_id(record):
    """Return the first DrugBank ID found under ``_DRUGBANK_ID_PATHS``.

    A path that is absent, or that cannot be followed because an
    intermediate value is not subscriptable by that key, is skipped.
    Returns None when no path yields a value.
    """
    for path in _DRUGBANK_ID_PATHS:
        node = record
        try:
            for key in path:
                node = node[key]
        except (KeyError, IndexError, TypeError):
            continue
        # NOTE(review): a cross-reference may presumably hold several IDs as
        # a list; the first entry is used so the result stays a single
        # value - confirm that this is the desired choice.
        if isinstance(node, (list, tuple)):
            node = node[0] if node else None
        if node is not None:
            return node
    return None


pubchem_to_dg_id_dict={}
data = pd.read_csv('C:/Users/Saathvik/Documents/MATLAB/Project_Liver/data/SIDER/sider2-processed.txt', sep='\t')
pubchem_cids=data['pubchem_cid'].unique()
for cid in pubchem_cids:
    # Start from "not found" so that nothing leaks over from the previous CID.
    drugbank_id = None
    try:
        # The timeout keeps one stalled request from blocking the whole loop;
        # a timeout is reported below like any other failure.
        response = requests.get(f"https://mychem.info/v1/chem/{cid}", timeout=30)
        if response.status_code == 200:
            drugbank_id = _find_drugbank_id(response.json())
    except Exception as e:
        # Best effort: report the failure and leave this CID unmapped.
        print(f"Error occurred for CID {cid}: {e}")
    pubchem_to_dg_id_dict[str(cid)]=drugbank_id


Adding the DrugBank IDs to the SIDER2 table and writing a file with the PubChem CID to DrugBank ID translations

In [None]:
# Attach a DrugBank ID to every SIDER row so the table can be processed
# together with the DrugBank data: one lookup per row, in row order
dg_ids = [pubchem_to_dg_id_dict[str(cid)] for cid in data['pubchem_cid']]

# `assign` returns a copy carrying the extra 'dg_id' column; `data` is untouched
new_df = data.assign(dg_id=dg_ids)
new_df.to_csv('C:/Users/Saathvik/Documents/MATLAB/Project_Liver/data/SIDER/sider2.csv', index=False)

# Keep the CID -> DrugBank ID mapping itself on disk for future reference
mapping_rows = list(pubchem_to_dg_id_dict.items())
df_tocsv = pd.DataFrame(mapping_rows, columns=['PubChem CID', 'DrugBank ID'])
df_tocsv.to_csv('C:/Users/Saathvik/Documents/MATLAB/Project_Liver/data/SIDER/pubchem_cid_to_dg_id.csv', index=False)



Filtering sider2 to only include 'side_effect' records that have a DrugBank ID: removing the 'indication' records and the records without a DrugBank ID

In [None]:
# Reload the SIDER2 table that carries the 'dg_id' column
df = pd.read_csv('C:/Users/Saathvik/Documents/MATLAB/Project_Liver/data/SIDER/sider2.csv')

# A row is kept only when it is a side-effect record AND has a DrugBank ID
is_side_effect = df['type'] == 'side_effect'
has_drugbank_id = df['dg_id'].notna()
filtered_df = df[is_side_effect & has_drugbank_id]

filtered_df.to_csv('C:/Users/Saathvik/Documents/MATLAB/Project_Liver/data/SIDER/sider2_processed.csv', index=False)