In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import xml.etree.ElementTree as ET
import numpy as np
from collections import defaultdict

# Function to assign custom IDs
def assign_doe_ids(df):
    """
    Assign a shared integer doe_id to rows that are linked by any identifier value.

    Two rows are linked when they hold the same non-null value in the same
    identifier column ('chembl_id', 'drugbank_id', 'cid' or 'name'). Links are
    transitive (A~B and B~C puts A, B and C in one group), which is resolved
    with a disjoint-set (union-find) structure.

    Rows are tracked by their position in ``df`` rather than by index label, so
    the grouping is correct for any index (non-default, non-contiguous, or with
    duplicate labels).

    Parameters:
    df (pd.DataFrame): DataFrame with columns 'chembl_id', 'drugbank_id', 'cid', and 'name'

    Returns:
    pd.Series: Series named 'doe_id', aligned to ``df.index``. Ids are integers
    numbered from 0 in order of first appearance of each group.
    """
    class DisjointSet:
        def __init__(self):
            self.parent = {}
            self.rank = defaultdict(int)

        def make_set(self, x):
            if x not in self.parent:
                self.parent[x] = x

        def find(self, x):
            # Unknown elements start out as their own singleton set
            if x not in self.parent:
                self.make_set(x)

            # First pass: walk up to the root
            root = x
            while self.parent[root] != root:
                root = self.parent[root]

            # Second pass (path compression): point every visited node at the root
            while self.parent[x] != root:
                self.parent[x], x = root, self.parent[x]
            return root

        def union(self, x, y):
            root_x = self.find(x)
            root_y = self.find(y)

            if root_x == root_y:
                return

            # Union by rank: attach the shallower tree under the deeper one
            if self.rank[root_x] < self.rank[root_y]:
                self.parent[root_x] = root_y
            elif self.rank[root_x] > self.rank[root_y]:
                self.parent[root_y] = root_x
            else:
                self.parent[root_y] = root_x
                self.rank[root_x] += 1

    ds = DisjointSet()
    id_columns = ['chembl_id', 'drugbank_id', 'cid', 'name']

    # For every "<column>_<value>" key remember the first row position that
    # carried it, and union every later row carrying the same key with it.
    first_position = {}
    for col in id_columns:
        for pos, value in enumerate(df[col].tolist()):
            if pd.isna(value):
                continue
            anchor = first_position.setdefault(f"{col}_{value}", pos)
            if anchor != pos:
                ds.union(anchor, pos)

    # Number the groups in order of first appearance
    root_to_doe_id = {}
    doe_ids = []
    for pos in range(len(df)):
        group_root = ds.find(pos)
        doe_ids.append(root_to_doe_id.setdefault(group_root, len(root_to_doe_id)))

    return pd.Series(doe_ids, index=df.index, name='doe_id')

# Phenotype reference tables; see "Diseases.ipynb" for how these files are produced.
# 'phecode1.2' is cast to float when the custom table is loaded.
merged_pheno = pd.read_excel('./Phenotyping/merged_phenotypes.xlsx')
icd = pd.read_csv('./Phenotyping/icd_codes.csv')
custom = pd.read_excel('./Phenotyping/custom_phenotypes.xlsx').astype({'phecode1.2': float})


## Phenotyping

In [None]:
# Build the phecode (v1.2) -> phenotype code table from three pieces and write it to Excel.
# Relies on `icd`, `merged_pheno` and `custom` loaded in an earlier cell.

# (1) Phecodes that correspond to exactly one 3-character ICD code.
p12 = pd.read_csv('./Phenotyping/phecode1.2_map.csv')
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the frame returned by read_csv.
# NOTE(review): Flag == 10 presumably selects ICD-10 rows -- confirm against the map file.
p12 = p12.loc[p12['Flag'] == 10].copy()
p12['ICD_3c'] = p12['ICD'].str[:3]
# Keep phecodes that have at least one ICD code whose 3-character stem is in `icd`...
keep = p12.loc[p12['ICD_3c'].isin(icd['ICD'])]['Phecode']
p12 = p12.loc[p12['Phecode'].isin(keep)]
# ...and that map to a single 3-character stem overall.
keep = p12.groupby('Phecode')['ICD_3c'].nunique().reset_index()
keep = keep.loc[keep['ICD_3c'] == 1]['Phecode']
p12 = p12.loc[p12['Phecode'].isin(keep)].copy()
p12['Phecode'] = p12['Phecode'].astype(float)
p12 = p12[['Phecode','ICD_3c']].drop_duplicates().set_axis(['phecode1.2','Code'],axis=1)
p12_icd = p12.loc[p12['Code'].isin(merged_pheno['Code'])]

# (2) Phecodes mapped directly to a custom phenotype code.
p12_custom = custom[['phecode1.2','custom_code']].dropna().drop_duplicates().rename({'custom_code':'Code'},axis=1)

# (3) Every phecode whose integer part equals a phecode listed in `custom`
#     inherits that custom code (e.g. 250.1 inherits the code of 250).
p12 = pd.read_csv('./Phenotyping/phecode1.2_map.csv')
p12['Phecode'] = p12['Phecode'].astype(float)
p12 = p12[['Phecode']].drop_duplicates()
p12['phecode1.2'] = p12['Phecode'].astype(str).str.split('.').str[0].astype(float)
p12 = p12.merge(custom[['phecode1.2','custom_code']].dropna().drop_duplicates())
p12_custom_extended = p12[['Phecode','custom_code']].set_axis(['phecode1.2','Code'],axis=1)

pcc = pd.concat([p12_icd,p12_custom,p12_custom_extended]).reset_index(drop=True).drop_duplicates()
pcc.to_excel('./Phenotyping/phecode_to_code.xlsx', index=False)


## Target data

In [None]:
# Turn DrugBank data into dataframe: one row per (drug, target) pair.
tree = ET.parse('./DrugBank/full database.xml') # Requires registration to download; please download yourself from https://go.drugbank.com/releases/latest
root = tree.getroot()

ns = {'ns': 'http://www.drugbank.ca'}


def _first_text(element, path):
    """Return the text of the first descendant of *element* matching *path*, or None if there is none."""
    match = element.find(path, ns)
    return match.text if match is not None else None


# Column name -> list of values; every list receives one entry per target row.
records = {
    'Drugbank': [],
    'Drug Type': [],
    'Target': [],
    'Gene': [],
    'Action': [],
    'Organism': [],
    'ChEMBL': [],
    'PubChem Compound': [],
    'Approval Status': [],
}

# NOTE(review): './/ns:drug' matches <drug> elements at any depth, not only the
# top-level entries; nested ones only contribute rows if they contain <target>
# descendants.
for drug in root.findall('.//ns:drug', ns):
    drugbank_id = drug.find('.//ns:drugbank-id', ns).text  # first id found under the drug
    drug_type = drug.get('type')  # Extract the drug type attribute

    # Cross-references; if a resource occurs more than once the last one wins.
    chembl_id = None
    pubchem_id = None
    for external_id in drug.findall('.//ns:external-identifier', ns):
        resource = external_id.find('.//ns:resource', ns).text
        identifier = external_id.find('.//ns:identifier', ns).text
        if resource == "ChEMBL":
            chembl_id = identifier
        elif resource == "PubChem Compound":
            pubchem_id = identifier

    # Only the first <group> under <groups> is used as the approval status.
    approval_status = _first_text(drug, './/ns:groups/ns:group')

    for target in drug.findall('.//ns:target', ns):
        records['Drugbank'].append(drugbank_id)
        records['Drug Type'].append(drug_type)
        records['Target'].append(_first_text(target, './/ns:name'))
        records['Gene'].append(_first_text(target, './/ns:gene-name'))
        records['Action'].append(_first_text(target, './/ns:action'))
        records['Organism'].append(_first_text(target, './/ns:organism'))
        records['ChEMBL'].append(chembl_id)
        records['PubChem Compound'].append(pubchem_id)
        records['Approval Status'].append(approval_status)

db_moa = pd.DataFrame(records)

# Numeric phase: 4 for 'approved', -1 for 'withdrawn', NaN for any other status.
db_moa['Phase'] = db_moa['Approval Status'].map({'approved':4,'withdrawn':-1})
db_moa.to_pickle('./DrugBank/drugbank_moa.pkl')


In [2]:
# ChEMBL 35
# https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/schema_documentation.html
# Table storing indications for drugs, and clinical candidate drugs, from a variety of sources (e.g., FDA, EMA, WHO ATC, ClinicalTrials.gov, INN, USAN).
ct = pd.read_csv('./ChEMBL/chembl_35_drug_targets.csv') # Obtained via SQL query (see ./ChEMBL)
# Keep human targets identified by gene symbol. .copy() makes the filtered frame
# independent, so the column assignments below do not write into a slice.
ct = ct.loc[(ct['syn_type'] == 'GENE_SYMBOL') & (ct['organism'] == 'Homo sapiens')].copy()
# Direction of effect: default 'Other', overridden for known negative/positive action types.
ct['moa'] = 'Other'
ct.loc[ct['action_type'].isin(['INHIBITOR', 'ANTAGONIST', 'BLOCKER', 'NEGATIVE ALLOSTERIC MODULATOR',
                                'ANTISENSE INHIBITOR', 'RELEASING AGENT', 'ALLOSTERIC ANTAGONIST', 'INVERSE AGONIST',
                                'NEGATIVE MODULATOR', 'DEGRADER', 'RNAI INHIBITOR']),'moa'] = 'Negative modulator'
ct.loc[ct['action_type'].isin(['AGONIST', 'OPENER', 'ACTIVATOR',
                                'POSITIVE ALLOSTERIC MODULATOR', 'POSITIVE MODULATOR',
                                'PARTIAL AGONIST']),'moa'] = 'Positive modulator'
ct = ct[['chembl_id','molecule_type','moa','action_type','component_synonym','pref_name']].rename({'molecule_type':'type','component_synonym':'gene','pref_name':'name','action_type':'action'},axis=1)
# Attach the maximum clinical trial phase per ChEMBL id (NaN where none is recorded).
phase = pd.read_csv('./ChEMBL/chembl_35_phase.csv')[['chembl_id','max_clinical_trial_phase']].set_axis(['chembl_id','phase'],axis=1).dropna()
ct = ct.merge(phase, how='left')
ct['name'] = ct['name'].str.upper()
# 'source' carries a numeric prefix and 'priority' a rank; both are used for sorting in later cells.
ct['source'] = '1. CHEMBL'
ct['priority'] = 1

###

# https://github.com/opentargets/issues/issues/533
# Open Targets mechanism-of-action records, classified by actionType
# ('Other' unless the action type is in one of the two lists below).
moa = pd.read_pickle('./OT/Raw/mechanismOfAction.pkl')
moa['moa'] = 'Other'
moa.loc[moa['actionType'].isin(['INHIBITOR', 'ANTAGONIST', 'BLOCKER', 'NEGATIVE ALLOSTERIC MODULATOR',
                                'ANTISENSE INHIBITOR', 'RELEASING AGENT', 'ALLOSTERIC ANTAGONIST', 'INVERSE AGONIST',
                                'NEGATIVE MODULATOR', 'DEGRADER', 'RNAI INHIBITOR']),'moa'] = 'Negative modulator'
moa.loc[moa['actionType'].isin(['AGONIST', 'OPENER', 'ACTIVATOR', 
                                'POSITIVE ALLOSTERIC MODULATOR', 'POSITIVE MODULATOR', 
                                'PARTIAL AGONIST']),'moa'] = 'Positive modulator'
moa = moa[['moa','chemblIds','targets','actionType']].rename({'actionType':'action'},axis=1)
# Expand the list-valued columns so there is one row per (ChEMBL id, target) pair.
moa = moa.explode('chemblIds')
moa = moa.explode('targets')
# Inner merge with targets.pkl (its 'gene_id' column renamed to 'targets'), joining on
# the columns the two frames share; records without a match are dropped.
# NOTE(review): targets.pkl presumably supplies the 'gene' column used later -- confirm.
moa = moa.merge(pd.read_pickle('./OT/Final/targets.pkl').rename({'gene_id':'targets'},axis=1))
moa = moa.rename({'chemblIds':'chembl_id'},axis=1).drop_duplicates()
# Highest phase per ChEMBL id across all indications (indication_all.pkl is written by another cell of this notebook).
phase = pd.read_pickle('./OT/Final/indication_all.pkl').groupby('id')['phase'].max().reset_index().dropna().rename({'id':'chembl_id'},axis=1)
moa = moa.merge(phase, how='left')
moa['source'] = '1. CHEMBL' # OT data is derived from CHEMBL
moa['priority'] = 2

###

# Drugbank 5.1.13
dpa = pd.read_pickle('./DrugBank/drugbank_moa.pkl')
# Human targets only. .copy() makes the filtered frame independent, so the
# column assignments below do not write into a slice of the unpickled frame.
dpa = dpa.loc[dpa['Organism'] == 'Humans'].copy()
# Highest phase per DrugBank id, computed before the columns are renamed below.
phase = dpa.groupby('Drugbank')['Phase'].max().reset_index().set_axis(['drugbank_id','phase'],axis=1)
dpa['Action'] = dpa['Action'].str.upper()
# Direction of effect: default 'Other', overridden for known negative/positive actions.
dpa['moa'] = 'Other'
dpa.loc[dpa['Action'].isin(['INHIBITOR', 'ANTAGONIST', 'BLOCKER', 'NEGATIVE ALLOSTERIC MODULATOR',
                            'ANTISENSE INHIBITOR', 'RELEASING AGENT', 'ALLOSTERIC ANTAGONIST', 'INVERSE AGONIST',
                            'NEGATIVE MODULATOR', 'DEGRADER', 'RNAI INHIBITOR', 'ANTISENSE OLIGONUCLEOTIDE',
                            'DOWNREGULATOR','INACTIVATOR','INHIBITORY ALLOSTERIC MODULATOR',
                            'SUPPRESSOR','NEUTRALIZER','WEAK INHIBITOR','PARTIAL ANTAGONIST',
                            'TRANSLOCATION INHIBITOR','DEGRADATION','NUCLEOTIDE EXCHANGE BLOCKER',
                            'INHIBITION OF SYNTHESIS']),'moa'] = 'Negative modulator'
dpa.loc[dpa['Action'].isin(['AGONIST', 'OPENER', 'ACTIVATOR',
                            'POSITIVE ALLOSTERIC MODULATOR', 'POSITIVE MODULATOR',
                            'PARTIAL AGONIST','INDUCER','POTENTIATOR','STIMULATOR',
                            'UPREGULATOR']),'moa'] = 'Positive modulator'
dpa = dpa[['Drugbank','Drug Type','ChEMBL','PubChem Compound','moa','Action','Gene']].set_axis(['drugbank_id','type','chembl_id','cid','moa','action','gene'],axis=1).drop_duplicates()
# Rows without a gene symbol cannot be used as drug-gene pairs.
dpa = dpa.loc[dpa['gene'].notna()]
dpa = dpa.merge(phase, how='left')
dpa['action'] = dpa['action'].fillna('UNKNOWN')
dpa['source'] = '2. DrugBank'
dpa['priority'] = 1

###

# Guide to PHARMACOLOGY 2024.4
# Only approved drugs or drugs in ChEMBL/DrugBank
# Only primary targets as non-primary targets are too broad
gtp = pd.read_csv('./Other/gtp_2024.4.csv', skiprows=1)
gtp = gtp.loc[gtp['Target Species'] == 'Human']
gtp = gtp.loc[gtp['Primary Target'] == True]
gtp = gtp[['Target Ensembl Gene ID','Ligand','Ligand ID',
           'Approved','Type','Action']].dropna(subset=['Target Ensembl Gene ID']).drop_duplicates()
gtp = gtp.set_axis(['gene_id','name','gtp_id','approved','type','action'],axis=1)
# Inner merge with targets.pkl on the shared columns; 'gene_id' is dropped afterwards.
# NOTE(review): targets.pkl presumably maps gene_id to the 'gene' symbol used below -- confirm.
gtp = gtp.merge(pd.read_pickle('./OT/Final/targets.pkl')).drop('gene_id',axis=1)
gtp['action'] = gtp['action'].str.upper()
gtp['type'] = gtp['type'].str.upper()
# Direction of effect: first from the interaction 'action' ...
gtp['moa'] = 'Other'
gtp.loc[gtp['action'].str.upper().isin(['INHIBITION','ANTAGONIST','INVERSE AGONIST','NEGATIVE',
                            'PORE BLOCKER','COMPETITIVE','IRREVERSIBLE INHIBITION']),'moa'] = 'Negative modulator'
gtp.loc[gtp['action'].str.upper().isin(['AGONIST','FULL AGONIST','PARTIAL AGONIST','POSITIVE',
                            'ACTIVATION','POTENTIATION']),'moa'] = 'Positive modulator'
# ... then from the interaction 'type'. These two run last, so a matching type
# overrides whatever the action-based rules assigned.
gtp.loc[gtp['type'].isin(['INHIBITOR','ANTAGONIST','CHANNEL BLOCKER','GATING INHIBITOR']),'moa'] = 'Negative modulator'
gtp.loc[gtp['type'].isin(['AGONIST','ACTIVATOR']),'moa'] = 'Positive modulator'
gtp.loc[gtp['type'].isin(['ANTIBODY']),'action'] = 'ANTIBODY'
# Missing actions fall back to the type, then to 'UNKNOWN'.
gtp['action'] = gtp['action'].fillna(gtp['type']).fillna('UNKNOWN')
gtp = gtp[['gene','gtp_id','action','moa','approved']].drop_duplicates()

# Ligand id mapping: keep ligands that are approved in GtP or whose DrugBank/ChEMBL
# id appears in the DrugBank (dpa), ChEMBL (ct) or Open Targets (moa) tables.
gn = pd.read_csv('./Other/gtp_ligand_id_mapping.csv', skiprows=1)[['Ligand id','Name','PubChem CID','ChEMBl ID','DrugBank ID','Type']].set_axis(['gtp_id','name','cid','chembl_id','drugbank_id','type'],axis=1).drop_duplicates()
gn = gn.loc[(gn['gtp_id'].isin(gtp.loc[gtp['approved'] == True]['gtp_id'])) | 
            (gn['drugbank_id'].isin(dpa['drugbank_id'])) |
            (gn['chembl_id'].isin(ct['chembl_id'])) |
            (gn['chembl_id'].isin(moa['chembl_id']))]
gtp = gtp.merge(gn, on='gtp_id').drop('gtp_id',axis=1).drop_duplicates()
gtp['name'] = gtp['name'].str.upper()
# NOTE(review): phase 4 is assigned to every retained ligand, including those kept
# only because of an id match rather than GtP approval -- confirm this is intended.
gtp['phase'] = 4
gtp['source'] = '3. GTP'
gtp['priority'] = 1

###

# Santos et al.'s manual curation
ndr = pd.read_excel('./Other/santos_nrdd_2016_st2.xlsx')
ndr = ndr.loc[ndr['ORGANISM'] == 'Homo sapiens']
ndr = ndr.dropna(subset=['MECHANISM_OF_ACTION','TARGET_PREF_NAME','ACCESSION'])
# Derive the action by removing the target name from the mechanism string
# (left unchanged when the target name does not occur in it).
ndr["ACTION"] = ndr.apply(
    lambda row: row["MECHANISM_OF_ACTION"].replace(row["TARGET_PREF_NAME"], "").strip()
    if row["TARGET_PREF_NAME"] in row["MECHANISM_OF_ACTION"] else row["MECHANISM_OF_ACTION"],
    axis=1
)
# Actions still containing one of these tokens are set to 'inhibitor'.
ndr.loc[ndr['ACTION'].str.contains('CDK4|PARP|DIOI'), 'ACTION'] = 'inhibitor'
ndr['ACTION'] = ndr['ACTION'].str.upper()
ndr = ndr.loc[ndr['ACTION'] != ''][['PARENT_PREF_NAME','ACTION','ACCESSION']]

# Direction of effect: default 'Other', overridden for known negative/positive actions.
ndr['moa'] = 'Other'
ndr.loc[ndr['ACTION'].isin(['INHIBITOR', 'ANTAGONIST', 'BLOCKER', 'NEGATIVE ALLOSTERIC MODULATOR',
                                'ANTISENSE INHIBITOR', 'RELEASING AGENT', 'ALLOSTERIC ANTAGONIST', 'INVERSE AGONIST',
                                'NEGATIVE MODULATOR', 'DEGRADER', 'RNAI INHIBITOR','DNA INHIBITOR']),'moa'] = 'Negative modulator'
ndr.loc[ndr['ACTION'].isin(['AGONIST', 'OPENER', 'ACTIVATOR', 
                                'POSITIVE ALLOSTERIC MODULATOR', 'POSITIVE MODULATOR', 
                                'PARTIAL AGONIST']),'moa'] = 'Positive modulator'

# Accession -> gene symbol via HGNC: one row per HGNC ID (first after sorting by
# 'HGNC ID' and 'Status'), joined to the accession -> HGNC ID table.
hgnc = pd.read_csv('./Other/hgnc_genes.txt', sep='\t')
hgnc = hgnc.sort_values(['HGNC ID','Status']).drop_duplicates('HGNC ID')
hgnc = hgnc[['HGNC ID','Approved symbol']].rename({'Approved symbol':'Gene'},axis=1)
conv = pd.read_csv('./Other/ndr_hgnc.tsv', sep='\t').set_axis(['ACCESSION','HGNC ID'],axis=1)
conv = conv.merge(hgnc)[['ACCESSION', 'Gene']]
# Inner merge: rows whose accession has no gene symbol are dropped.
ndr = ndr.merge(conv)
ndr = ndr[['PARENT_PREF_NAME','ACTION','moa','Gene']].set_axis(['name','action','moa','gene'],axis=1).drop_duplicates()

# Attach DrugBank and ChEMBL ids by drug name (left merges keep unmatched rows).
dnl = pd.read_csv('./Drugs/Conversion/drug links.csv').rename({'DrugBank ID':'drugbank_id','Name':'name'},axis=1)[['name','drugbank_id']]
dnl['name'] = dnl['name'].str.upper()
ndr = ndr.merge(dnl, how='left')
# NOTE(review): 'pref_name' is not upper-cased here, unlike dnl['name'] above;
# the match assumes both sides already share the same casing -- confirm.
ctn = pd.read_csv('./ChEMBL/chembl_35_drug_targets.csv')[['pref_name','chembl_id']].rename({'pref_name':'name'},axis=1)
ndr = ndr.merge(ctn, how='left').drop_duplicates()
ndr['phase'] = 4
ndr['source'] = '1. CHEMBL'
ndr['priority'] = 3 # NDR data is primarily derived from CHEMBL; lower priority than CHEMBL35 or OT as it is the oldest

###

# Gene-level summary across all sources: for each gene, the array of distinct
# sources reporting it and the highest phase of any drug targeting it.
adg = pd.concat([moa,ct,dpa,gtp,ndr]).reset_index(drop=True).drop(['targets'],axis=1).drop_duplicates()
# Strip the numeric ordering prefix ('1. CHEMBL' -> 'CHEMBL'). regex=False makes the
# separator a literal '. ' instead of a two-character regular expression, matching
# how drug types are split later in this notebook.
adg['source'] = adg['source'].str.split('. ', regex=False).str[1]
maxphase = adg.groupby('gene')['phase'].max().reset_index()
adg = adg.groupby('gene')['source'].unique().reset_index().merge(maxphase, how='left')
adg.to_pickle('./Drugs/drug_genes.pkl')


  gtp = pd.read_csv('./Other/gtp_2024.4.csv', skiprows=1)


In [None]:
# Pool every source table into one long drug-target table.
source_tables = [ct, moa, dpa, gtp, ndr]
targets = pd.concat(source_tables, ignore_index=True).drop(columns=['targets'])
targets['name'] = targets['name'].str.upper()


def _max_phase_by(frame, key):
    """Return the highest 'phase' per value of *key* as a two-column frame."""
    return frame.groupby(key)['phase'].max().reset_index()


# Highest phase per identifier, kept aside until ids have been harmonised.
chembl_phase = _max_phase_by(targets, 'chembl_id')
drugbank_phase = _max_phase_by(targets, 'drugbank_id')
cid_phase = _max_phase_by(targets, 'cid').astype({'cid': float})
targets = targets.drop(columns='phase').drop_duplicates()

# Fixing names
name_fixes = {
    'NO': 'NITRIC OXIDE',
    'LH': 'LUTROPIN ALFA',
    'LI<SUP>+</SUP>': 'LITHIUM CITRATE',
}
for wrong_name, fixed_name in name_fixes.items():
    targets.loc[targets['name'] == wrong_name, 'name'] = fixed_name

def _fill_missing(frame, lookup, key, fill_col):
    """
    Fill null values of ``frame[fill_col]`` from ``lookup``, matching rows on ``key``.

    ``lookup`` is reduced to its complete, de-duplicated (key, fill_col) pairs and
    left-merged onto ``frame``; values already present in ``frame`` are never
    overwritten. A key with several distinct fill values in ``lookup`` yields one
    output row per value.
    """
    mapping = lookup[[key, fill_col]].dropna().drop_duplicates().rename({fill_col: 'filler'}, axis=1)
    merged = frame.merge(mapping, how='left', on=key)
    merged[fill_col] = merged[fill_col].fillna(merged['filler'])
    return merged.drop('filler', axis=1)


# Filling in missing ChEMBL IDs by name (upper-cased ChEMBL molecule synonyms)
cs = pd.read_csv('./ChEMBL/chembl_35_drug_targets.csv')[['molregno','chembl_id']]
cs = cs.merge(pd.read_csv('./ChEMBL/chembl_35_molecule_synonyms.csv')[['molregno','syn_type','synonyms']]).drop_duplicates().rename({'synonyms':'name'},axis=1)
cs['name'] = cs['name'].str.upper()
targets = _fill_missing(targets, cs, 'name', 'chembl_id')

# Manually mapped name to ChEMBL ID
noid = pd.read_excel('./Drugs/Conversion/noid.xlsx')
targets = _fill_missing(targets, noid, 'name', 'chembl_id')

# Filling in missing ChEMBL and DrugBank IDs where at least one is available,
# using the (chembl_id, drugbank_id) pairs already present in the table itself.
# The second call sees the result of the first.
targets = _fill_missing(targets, targets, 'chembl_id', 'drugbank_id')
targets = _fill_missing(targets, targets, 'drugbank_id', 'chembl_id')

# Filling in missing values from Guide to Pharmacology by name
gn = pd.read_csv('./Other/gtp_ligand_id_mapping.csv', skiprows=1)[['Name', 'PubChem CID', 'ChEMBl ID', 'DrugBank ID']].drop_duplicates()
gn = gn.set_axis(['name', 'cid', 'chembl_id', 'drugbank_id'], axis=1).dropna(subset=['name'])
gn['name'] = gn['name'].str.upper()
# Names shorter than 5 characters are not used for matching.
gn = gn.loc[gn['name'].str.len() >= 5]

# Order matters: each step works on the output of the previous one.
for key, fill_col in [
    ('name', 'chembl_id'),
    ('name', 'drugbank_id'),
    ('name', 'cid'),
    ('chembl_id', 'drugbank_id'),
    ('drugbank_id', 'chembl_id'),
]:
    targets = _fill_missing(targets, gn, key, fill_col)

# Converting all to CIDs
# Export the identifier lists (and the names of rows that still lack a CID).
# NOTE(review): the *_to_cid.txt files read below are not produced in this cell --
# presumably generated externally from the lists written here; confirm the workflow.
targets['chembl_id'].dropna().drop_duplicates().to_csv('./Drugs/Conversion/chembl_ids.txt', index=False, header=False)
targets['drugbank_id'].dropna().drop_duplicates().to_csv('./Drugs/Conversion/drugbank_ids.txt', index=False, header=False)
targets.loc[targets['cid'].isna()]['name'].drop_duplicates().to_csv('./Drugs/Conversion/no_cid_names.txt', index=False, header=False)
targets['cid'] = targets['cid'].astype(float)
# For each identifier type, combine the pairs read from file with the
# (identifier, cid) pairs already present in targets.
c1 = pd.read_csv('./Drugs/Conversion/chembl_ids_to_cid.txt', sep='\t', header=None).set_axis(['chembl_id','cid'],axis=1).dropna()
c1 = pd.concat([c1,targets[['chembl_id','cid']].dropna()]).drop_duplicates()
c2 = pd.read_csv('./Drugs/Conversion/drugbank_ids_to_cid.txt', sep='\t', header=None).set_axis(['drugbank_id','cid'],axis=1).dropna()
c2 = pd.concat([c2,targets[['drugbank_id','cid']].dropna()]).drop_duplicates()
c3 = pd.read_csv('./Drugs/Conversion/names_to_cid.txt', sep='\t', header=None).set_axis(['name','cid'],axis=1).dropna()
c3 = pd.concat([c3,targets[['name','cid']].dropna()]).drop_duplicates()

# Rows that have a CID but neither a ChEMBL nor a DrugBank id keep their CID as-is.
other = targets.loc[(targets['cid'].notna()) & (targets['chembl_id'].isna()) & (targets['drugbank_id'].isna())]
# Re-attach CIDs by ChEMBL id (c1), DrugBank id (c2) and name (c3), then stack the
# results. The left merges keep unmatched rows, so rows with a null cid remain
# alongside the matched ones.
targets = targets.drop('cid',axis=1)
c1 = targets.merge(c1, how='left')
c2 = targets.merge(c2, how='left')
c3 = targets.merge(c3, how='left')
targets = pd.concat([c1,c2,c3,other]).drop_duplicates()

# Defining drug types
targets['type'] = targets['type'].str.upper()
# Set the typed rows aside, then remove 'type' from targets; it is re-attached
# per identifier further down.
drug_type = targets[['chembl_id','drugbank_id','cid','name','source','priority','type']].dropna(subset='type').drop_duplicates()
targets = targets.drop('type',axis=1).drop_duplicates()
# Raw type -> harmonised type. The numeric prefix only serves as a sort key
# (lower sorts first) and is stripped again after de-duplication.
drug_type_map = {
    "SMALL MOLECULE": "5. SMALL MOLECULE",
    "SYNTHETIC ORGANIC": "5. SMALL MOLECULE",
    "PEPTIDE": "3. PEPTIDE",
    "ANTIBODY": "2. ANTIBODY",
    "BIOTECH": "6. UNSPECIFIED BIOLOGIC",
    "METABOLITE": "5. SMALL MOLECULE",
    "NATURAL PRODUCT": "5. SMALL MOLECULE",
    "PROTEIN": "4. PROTEIN",
    "UNKNOWN": "7. UNKNOWN",
    "ANTIBODY DRUG CONJUGATE": "1. ANTIBODY DRUG CONJUGATE",
    "INORGANIC": "5. SMALL MOLECULE", # Lithium and NO; classified by ChEMBL as small molecule
    "OLIGOSACCHARIDE": "3. OLIGOSACCHARIDE",
    "OLIGONUCLEOTIDE": "3. OLIGONUCLEOTIDE",
    "ENZYME": "4. PROTEIN",
    "GENE": "3. GENE",
    "CELL": "3. CELL"
}
# Types missing from the map become NaN.
drug_type['type'] = drug_type['type'].map(drug_type_map)
# Keep one type per identifier combination: first after sorting by type, source, priority.
drug_type = drug_type.sort_values(['type','source','priority']).drop_duplicates(['chembl_id','drugbank_id','cid','name']).drop(['source','priority'],axis=1)
drug_type['type'] = drug_type['type'].str.split('. ', regex=False).str[1]
# Look the type up by each identifier separately; the final value prefers the
# match by cid, then chembl_id, then drugbank_id, then name.
targets = targets.merge(drug_type[['cid','type']].dropna().drop_duplicates('cid').rename({'type':'type1'},axis=1), on='cid', how='left')
targets = targets.merge(drug_type[['chembl_id','type']].dropna().drop_duplicates('chembl_id').rename({'type':'type2'},axis=1), on='chembl_id', how='left')
targets = targets.merge(drug_type[['drugbank_id','type']].dropna().drop_duplicates('drugbank_id').rename({'type':'type3'},axis=1), on='drugbank_id', how='left')
targets = targets.merge(drug_type[['name','type']].dropna().drop_duplicates('name').rename({'type':'type4'},axis=1), on='name', how='left')
targets['type'] = targets['type1'].fillna(targets['type2']).fillna(targets['type3']).fillna(targets['type4'])
targets = targets.drop(['type1','type2','type3','type4'],axis=1).drop_duplicates()

# Standardizing action names
# Raw action label -> harmonised category; applied only to rows whose moa is 'Other'.
category_map = {
    "MODULATOR": "MODULATOR",
    "BINDER": "BINDING AGENT",
    "LIGAND": "BINDING AGENT",
    "BINDING AGENT": "BINDING AGENT",
    "COFACTOR": "COFACTOR",
    "STABILISER": "STABILISER",
    "SUBSTRATE": "SUBSTRATE",
    "DISRUPTING AGENT": "DISRUPTING AGENT",
    "HYDROLYTIC ENZYME": "HYDROLYTIC ENZYME",
    "OTHER": "UNSPECIFIED",
    "OTHER/UNKNOWN": "UNSPECIFIED",
    "EXOGENOUS PROTEIN": "GENE OR PROTEIN REPLACEMENT",
    "PRODUCT OF": "UNSPECIFIED",
    "CROSS-LINKING AGENT": "CROSS-LINKING AGENT",
    "ANTIBODY": "BINDING AGENT",
    "REGULATOR": "MODULATOR",
    "CHAPERONE": "STABILISER",
    "VACCINE ANTIGEN": "OTHER SPECIFIED",
    "UNKNOWN": "UNSPECIFIED",
    "EXOGENOUS GENE": "GENE OR PROTEIN REPLACEMENT",
    "STABILIZATION": "STABILISER",
    "ALLOSTERIC MODULATOR": "MODULATOR",
    "BINDING": "BINDING AGENT",
    "COMPONENT OF": "UNSPECIFIED",
    "OXIDIZER": "OTHER SPECIFIED",
    "PROTEOLYTIC ENZYME": "PROTEOLYTIC ENZYME",
    "CLEAVAGE": "OTHER SPECIFIED",
    "CHELATING AGENT": "CHELATING AGENT",
    "CHELATOR": "CHELATING AGENT",
    "INCORPORATION INTO AND DESTABILIZATION": "DISRUPTING AGENT",
    "CARRIER": "OTHER SPECIFIED",
    "GENE REPLACEMENT": "GENE OR PROTEIN REPLACEMENT",
    "MULTITARGET": "UNSPECIFIED"
}
# NOTE(review): for moa == 'Other' rows, an action that is not a key of category_map
# is replaced by NaN here (Series.map yields NaN for unmapped values), and NaN rows
# pass the '!= UNSPECIFIED' filter below -- confirm this is intended.
targets.loc[targets['moa'] == 'Other', 'action'] = targets['action'].map(category_map)
targets = targets.loc[(targets['action'] != 'UNSPECIFIED')]
# Fresh 0..n-1 index before ids are assigned in the next step.
targets = targets.reset_index(drop=True)

# Assign IDs based on disjoint sets
# Rows sharing any of chembl_id / drugbank_id / cid / name receive the same 'id'
# (see assign_doe_ids defined at the top of this notebook).
targets['id'] = assign_doe_ids(targets)

# Retaining mappings for SIDER
# These three files are also read back later in this cell to attach phases.
targets[['cid','id']].dropna().drop_duplicates().to_csv('./Drugs/DOE IDs/cid_to_doeid.csv',index=False)
targets[['chembl_id','id']].dropna().drop_duplicates().to_csv('./Drugs/DOE IDs/chembl_to_doeid.csv',index=False)
targets[['drugbank_id','id']].dropna().drop_duplicates().to_csv('./Drugs/DOE IDs/drugbank_to_doeid.csv',index=False)
# One row per (id, gene, source): the first after sorting by source, then priority.
targets = targets.sort_values(['source','priority']).drop_duplicates(['id','gene','source']).reset_index(drop=True)

# Address drugs with conflicting MOAs
# Each (id, gene, source) row votes for its moa: weight 2 for ChEMBL/DrugBank rows,
# 1 otherwise. Votes are summed per (id, gene, moa).
targets['weight'] = 1.0
targets.loc[targets['source'].isin(['1. CHEMBL','2. DrugBank']), 'weight'] = 2.0 # Prioritize CHEMBL and DrugBank over GTP data
cfr = targets.groupby(['id','gene','moa'])['source'].unique().reset_index()
cfr = cfr.merge(targets.groupby(['id','gene','moa'])['weight'].sum().reset_index())
cfr.loc[cfr['moa'] == 'Other', 'weight'] = cfr['weight']*0.75 # Slightly prioritize positive/negative over other

# Report, for protein-coding genes only: the number of (id, gene) pairs and how
# many of them have more than one distinct moa.
genes = pd.read_pickle('./OT/Final/protein_coding_genes.pkl')
genes = genes.loc[~genes['gene'].str.contains('ENSG')]
dc = cfr.groupby(['id','gene'])['moa'].nunique().reset_index()
dc = dc.loc[dc['gene'].isin(genes['gene'])]
print(len(dc[['id','gene']].drop_duplicates()))
print(len(dc.loc[dc['moa'] > 1]))

# Keep the highest-weight moa per (id, gene).
cfr = cfr.sort_values(['id','gene','weight'], ascending=[True,True,False]).drop_duplicates(['id','gene'])
# 'sources' is the string form of the array of supporting sources with the
# numeric ordering prefixes removed.
cfr['sources'] = cfr['source'].astype(str).str.replace('1. ','').str.replace('2. ','').str.replace('3. ','')

# Inner merge on the shared columns (id, gene, moa): rows carrying a losing moa are dropped.
targets = cfr[['id','gene','moa','sources']].merge(targets)
# One row per (id, gene): the first after sorting by source, then priority.
targets = targets.sort_values(['source','priority']).drop_duplicates(['id','gene']).reset_index(drop=True)
targets = targets.drop(['source','priority','weight'],axis=1)
targets['type'] = targets['type'].fillna('UNKNOWN')

targets = targets[['id','type','gene','moa','action','sources']].drop_duplicates(['id','gene'])

# Translate the per-identifier maximum phases to DOE ids using the mapping files
# written earlier in this cell, then keep the highest phase per id.
phase_sources = [
    (cid_phase, './Drugs/DOE IDs/cid_to_doeid.csv'),
    (drugbank_phase, './Drugs/DOE IDs/drugbank_to_doeid.csv'),
    (chembl_phase, './Drugs/DOE IDs/chembl_to_doeid.csv'),
]
phase_by_id = []
for phase_table, mapping_path in phase_sources:
    id_mapping = pd.read_csv(mapping_path)
    phase_by_id.append(phase_table.dropna().merge(id_mapping)[['id','phase']])
phase = pd.concat(phase_by_id).groupby('id')['phase'].max().reset_index()

# Ids without any recorded phase get -1.
targets = targets.merge(phase, how='left')
targets['phase'] = targets['phase'].fillna(-1)

targets.to_pickle('./Drugs/targets.pkl')


In [13]:
# Supplemental table: number of distinct drug ids per (moa, action) combination,
# with moa relabelled for presentation.
moa_labels = {'Negative modulator':'Inhibitor','Positive modulator':'Activator','Other':'Other'}
targets = (
    pd.read_pickle('./Drugs/targets.pkl')
    .groupby(['moa','action'])['id']
    .nunique()
    .reset_index()
    .assign(moa=lambda counts: counts['moa'].map(moa_labels))
)
targets.to_excel('./Supplemental tables/target_counts.xlsx', index=False)


In [9]:
# Count distinct genes and distinct drug ids among targets restricted to
# protein-coding genes whose symbol does not contain 'ENSG'.
genes = pd.read_pickle('./OT/Final/protein_coding_genes.pkl')
has_ensg_symbol = genes['gene'].str.contains('ENSG')
genes = genes.loc[~has_ensg_symbol]

targets = pd.read_pickle('./Drugs/targets.pkl')
is_protein_coding = targets['gene'].isin(genes['gene'])
targets = targets.loc[is_protein_coding]

for column in ('gene', 'id'):
    print(targets[column].nunique())


2553
7341


## Open Targets data

In [None]:
# Map Open Targets disease ids to phenotype codes ('Code') through several routes
# and save the union. Relies on `custom` and `merged_pheno` loaded in an earlier cell.

# One row per (disease, cross-reference); each 'TERMINOLOGY:code' xref is split on ':'.
disease = pd.read_pickle("./OT/Raw/diseases.pkl")[['id','dbXRefs','name']]
disease = disease.explode('dbXRefs').dropna(subset='dbXRefs')
disease['terminology'] = disease['dbXRefs'].str.split(':').str[0]
disease['code'] = disease['dbXRefs'].str.split(':').str[1].astype(str)
# .copy() so the 'icd_type' assignments below work on an independent frame
# rather than on a column subset of the previous one.
disease = disease[['id','name','terminology','code']].copy()
disease.loc[disease['terminology'].str.contains('ICD9'), 'icd_type'] = 'ICD9'
disease.loc[disease['terminology'].str.contains('ICD10'), 'icd_type'] = 'ICD10'

# "umls.pkl" is a pickled version of MRCONSO.RRF available from https://www.nlm.nih.gov/research/umls/index.html
# NOTE(review): columns 0, 11 and 13 are presumably the concept id, source
# vocabulary and source code of MRCONSO -- confirm against the UMLS documentation.
# NOTE(review): hard-coded absolute path; the rest of the notebook uses relative paths.
umls = pd.read_pickle('/sc/arion/projects/GENECAD/Robert/Drug_Phewas/Drugs/Resources/umls.pkl')[[0,11,13]]
# Column-0 value -> ICD-10 code, taken from the ICD-10 vocabularies.
umls_icd = umls.loc[umls[11].isin(['ICD10CM','ICD10PCS','ICD10'])]
umls_icd = umls_icd[[0,13]].drop_duplicates().rename({13:'ICD10'},axis=1)
# Codes from every non-ICD vocabulary, linked to ICD-10 through the shared column-0 value.
umls_other = umls.loc[~umls[11].str.contains('ICD9|ICD10')]
umls_other = umls_other.merge(umls_icd, on=0)
umls_other = umls_other[[13,11,'ICD10']].rename({13:'code',11:'umls_terminology'},axis=1).drop_duplicates()

# Route 1: diseases with a direct ICD10 cross-reference.
# .copy() so the two derived columns are added to an independent frame.
d1 = disease.loc[disease['terminology'] == 'ICD10'].copy()
d1['ICD'] = d1['code'].str[:3]
d1['ICD_3'] = d1['code'].str[:5]
# 1a: first 3 characters match a custom 'icd' entry
d1a = custom[['icd','custom_code']].drop_duplicates().dropna().set_axis(['ICD','Code'],axis=1)
d1a = d1.merge(d1a)[['id','name','terminology','code','Code']]
# 1b: first 3 characters are themselves a code in merged_pheno
d1b = d1.loc[d1['ICD'].isin(merged_pheno['Code'])].rename({'ICD':'Code'},axis=1)[['id','name','terminology','code','Code']]
# 1c: first 5 characters match a custom 'icd' entry
d1c = custom[['icd','custom_code']].drop_duplicates().dropna().set_axis(['ICD_3','Code'],axis=1)
d1c = d1.merge(d1c)[['id','name','terminology','code','Code']]

# Route 2: Open Targets ids listed directly in the custom table ('ot_code').
d2 = custom[['ot_code','custom_code']].drop_duplicates().dropna().set_axis(['id','Code'],axis=1)
d2 = disease.merge(d2)[['id','name','terminology','code','Code']]

# Route 3: cross-reference codes that equal a umls_icd column-0 value; the linked
# ICD-10 code is truncated to 3 characters and must be a code in merged_pheno.
d3 = disease.merge(umls_icd.set_axis(['code','ICD'],axis=1))
d3['ICD'] = d3['ICD'].str[:3]
d3 = d3.loc[d3['ICD'].isin(merged_pheno['Code'])].drop_duplicates().rename({'ICD':'Code'},axis=1)
d3 = d3[['id','name','terminology','code','Code']]

# Route 4: cross-references in other vocabularies, translated to ICD-10 via UMLS and
# restricted (inner merge) to the rows of the umls_equivalence sheet.
disease_umls = disease.copy()
disease_umls = disease_umls.merge(umls_other)
# NOTE(review): hard-coded absolute path; the rest of the notebook uses relative paths.
check = pd.read_excel('/sc/arion/projects/GENECAD/Robert/Drug_Phewas/OT/Raw/umls_equivalence.xlsx').dropna()
disease_umls = disease_umls.merge(check)
# .copy() so the two derived columns are added to an independent frame.
disease_umls = disease_umls[['id','name','terminology','code','ICD10']].copy()
disease_umls['ICD'] = disease_umls['ICD10'].str[:3]
disease_umls['ICD_3'] = disease_umls['ICD10'].str[:5]
# Same three matching rules as route 1, applied to the translated ICD-10 codes.
du1 = custom[['icd','custom_code']].drop_duplicates().dropna().set_axis(['ICD','Code'],axis=1)
du1 = disease_umls.merge(du1)[['id','name','terminology','code','Code']]
du2 = disease_umls.loc[disease_umls['ICD'].isin(merged_pheno['Code'])]
du2 = du2.rename({'ICD':'Code'},axis=1)[['id','name','terminology','code','Code']]
du3 = custom[['icd','custom_code']].drop_duplicates().dropna().set_axis(['ICD_3','Code'],axis=1)
du3 = disease_umls.merge(du3)[['id','name','terminology','code','Code']]

# NOTE(review): `map` shadows the Python builtin; the name is kept because a later
# cell also uses it.
map = pd.concat([d1a,d1b,d1c,d2,d3,du1,du2,du3])[['id','name','terminology','code','Code']].drop_duplicates()
map.to_pickle('./OT/Final/diseases.pkl')


In [13]:
# Raw Open Targets indication records.
indications = pd.read_pickle("./OT/Raw/indication.pkl")

# One row per (drug id, approved indication); every approved indication is given phase 4.
exploded_indications = (
    indications[['id','approvedIndications']]
    .explode('approvedIndications')
    .dropna()
    .rename(columns={'approvedIndications':'disease'})
    .assign(maxPhaseForIndication=4)
)

# Function to process each row and extract the desired information
def process_row(row):
    """
    Flatten one drug record into one dict per listed indication.

    Parameters:
    row: mapping-like object (e.g. a DataFrame row) exposing 'id' and an
         iterable 'indications' whose items each provide 'disease' and
         'maxPhaseForIndication'

    Returns:
    list: dicts with keys 'id', 'disease' and 'maxPhaseForIndication', in the
          same order as row['indications']; empty when there are no indications
    """
    drug_id = row['id']
    return [
        {
            'id': drug_id,
            'disease': entry['disease'],
            'maxPhaseForIndication': entry['maxPhaseForIndication'],
        }
        for entry in row['indications']
    ]

# Flatten every drug's 'indications' list into one record per indication.
new_rows = [
    record
    for _, drug_row in indications.iterrows()
    for record in process_row(drug_row)
]

# Tabulate the flattened records.
expanded_df = pd.DataFrame(new_rows)

# Stack the listed indications with the approved ones, call the phase column
# 'phase', order each (id, disease) pair by descending phase and persist.
# drop_duplicates() compares whole rows here, so an (id, disease) pair that
# appears with several different phases keeps one row per distinct phase.
indications = (
    pd.concat([expanded_df, exploded_indications])
    .rename(columns={'maxPhaseForIndication': 'phase'})
    .sort_values(['id', 'disease', 'phase'], ascending=[True, True, False])
    .drop_duplicates()
    .reset_index(drop=True)
)
indications.to_pickle('./OT/Final/indication_all.pkl')


In [None]:
def _prefixed_code_map(disease_map, phecode_map, terminology):
    """
    Build (id, Code) pairs keyed by '<terminology>_<code>' for one terminology.

    Parameters:
    disease_map (pd.DataFrame): table with 'terminology', 'code' and 'id' columns
    phecode_map (pd.DataFrame): table with 'id' and 'Code' columns
    terminology (str): terminology label to select, also used as the id prefix

    Returns:
    pd.DataFrame: columns 'id' and 'Code', where 'id' has been replaced by
                  terminology + '_' + code
    """
    subset = disease_map.loc[disease_map['terminology'] == terminology, ['code', 'id']]
    # Explicit key: pick up the phenotype codes attached to the same disease id.
    subset = subset.merge(phecode_map, on='id')
    subset['id'] = terminology + '_' + subset['code'].astype(str)
    return subset[['id', 'Code']]


# NOTE(review): `map` shadows the Python builtin; the name is kept so anything
# else in the notebook that refers to this DataFrame keeps working.
map = pd.read_pickle('./OT/Final/diseases.pkl')
map['code'] = map['code'].astype(str)

# Disease ids that carry a phenotype code.
phecode_map = map.loc[map['Code'].notna(), ['id', 'Code']]

# The same codes re-keyed by '<terminology>_<code>', one table per terminology
# (previously four copy-pasted stanzas).
mondo_map = _prefixed_code_map(map, phecode_map, 'MONDO')
hp_map = _prefixed_code_map(map, phecode_map, 'HP')
orpha_map = _prefixed_code_map(map, phecode_map, 'Orphanet')
efo_map = _prefixed_code_map(map, phecode_map, 'EFO')

eva_map = pd.concat([phecode_map, mondo_map, hp_map, orpha_map, efo_map]).drop_duplicates(['id', 'Code'])
eva_map = eva_map.rename(columns={'id': 'disease'})

# Reload the indication table from disk instead of reusing (and overwriting)
# whatever `indications` object another cell left in the kernel. This keeps the
# cell runnable on its own and re-runnable, like the other cells that start
# from a pickle.
indications = pd.read_pickle('./OT/Final/indication_all.pkl')
indications = indications.merge(eva_map, on='disease')
# Keep the highest phase reached for each (drug, disease, phenotype code).
indications = indications.groupby(['id', 'disease', 'Code'])['phase'].max().reset_index()
indications.to_pickle('./OT/Final/indication_cleaned.pkl')


In [64]:
# Indications that already carry a phenotype code.
ind = pd.read_pickle('./OT/Final/indication_cleaned.pkl')

# Drug target table joined (on shared column names) with chembl_to_doeid.csv.
targets = pd.read_pickle('./Drugs/targets.pkl')
targets = targets.merge(pd.read_csv('./Drugs/DOE IDs/chembl_to_doeid.csv'))

# Attach targets to indications via 'chembl_id' (plus any other column name the
# two frames happen to share - the merge is implicit), keep the analysis
# columns, and drop exact duplicates and rows missing Code, moa or gene.
ind = ind.rename(columns={'id': 'chembl_id'}).merge(targets)
ind = ind[['chembl_id', 'Code', 'phase', 'moa', 'gene']]
ind = ind.drop_duplicates().dropna(subset=['Code', 'moa', 'gene'])
ind.to_pickle('./Drugs/ot.pkl')


## FDA/EU data

In [67]:
# Phase convention: 0.5 by default, 4 when explicitly approved, and -1 when the
# designation was withdrawn/revoked and there is no marketing approval date.
_fda_columns = ['Generic Name', 'Trade Name', 'Orphan Designation', 'Orphan Designation Status', 'Marketing Approval Date']
fda = pd.read_excel('./Other/Search_results.xlsx')[_fda_columns]
fda['phase'] = 0.5
_fda_approved = fda['Marketing Approval Date'].notna()
_fda_withdrawn = fda['Orphan Designation Status'] == 'Designated/Designation Withdrawn or Revoked'
fda.loc[_fda_withdrawn & ~_fda_approved, 'phase'] = -1
fda.loc[_fda_approved, 'phase'] = 4
fda = fda[['Generic Name', 'Trade Name', 'Orphan Designation', 'phase']].rename(
    columns={'Generic Name': 'product', 'Trade Name': 'trade name', 'Orphan Designation': 'indication'}
)
# Two views of the FDA table so a drug can be matched by generic or trade name.
fda1 = fda.drop(columns='trade name')
fda2 = fda.drop(columns='product').rename(columns={'trade name': 'product'})

# EU register: any value other than the literal '-' in the tradename column
# (a missing value included) is marked as phase 4.
_eu_status_col = 'Tradename - EU product # - Implemented on'
eur = pd.read_excel('./Other/Union Register of medicinal products - Public health - European Commission.xlsx', skiprows=2)
eur = eur[['Product', 'Indication', _eu_status_col]]
eur['phase'] = 0.5
eur.loc[eur[_eu_status_col] != '-', 'phase'] = 4
eur = eur.drop(columns=_eu_status_col).rename(columns={'Product': 'product', 'Indication': 'indication'})

# Stack all sources, lower-case the text columns, drop exact duplicates and
# incomplete rows, and discard indications mentioning 'detect'.
rd = pd.concat([fda1, fda2, eur]).sort_values('phase', ascending=False)
for _text_col in ('product', 'indication'):
    rd[_text_col] = rd[_text_col].str.lower()
rd = rd.drop_duplicates().dropna()
rd = rd.loc[~rd['indication'].str.contains('detect')]

# 'Search' is the indication text with boilerplate phrases removed, everything
# except letters, digits and spaces stripped, and outer whitespace trimmed.
_search = rd['indication']
for _phrase in ('treatment of', 'prevention of', 'patients with'):
    _search = _search.str.replace(_phrase, '')
rd['Search'] = _search.str.replace(r'[^a-zA-Z0-9 ]', '', regex=True).str.strip()

# Resolve product names to ChEMBL ids through lower-cased molecule synonyms.
ct = pd.read_csv('./ChEMBL/chembl_35_drug_targets.csv')[['molregno', 'chembl_id']]
ctd = pd.read_csv('./ChEMBL/chembl_35_molecule_synonyms.csv')[['molregno', 'synonyms']]
ct = ct.merge(ctd).drop_duplicates().rename(columns={'synonyms': 'product'})
ct['product'] = ct['product'].str.lower()
rd = rd.merge(ct)
#rd['Search'].drop_duplicates().to_excel('./Phenotyping/rare_phenotypes.xlsx', index=False)

# Search-term -> ICD matches, turned into phenotype codes two ways:
# m1 keeps 3-character ICD prefixes that are merged_pheno codes,
# m2 keeps custom codes joined through `custom` (implicit merge keys).
matches = pd.read_excel('./Phenotyping/rare_matches.xlsx').dropna()
matches['icd_3'] = matches['icd'].str[:3]
m1 = matches.loc[matches['icd_3'].isin(merged_pheno['Code'])]
m1 = m1[['Search', 'icd_3']].rename(columns={'icd_3': 'Code'})
_custom_pairs = custom[['icd', 'custom_code']].dropna().drop_duplicates()
m2 = matches.merge(_custom_pairs)[['Search', 'custom_code']].rename(columns={'custom_code': 'Code'})
matches = pd.concat([m1, m2])
rd = rd.merge(matches)

# Attach gene / mechanism of action and keep, for every (Code, gene, moa), the
# row with the highest phase (first row after the descending sort).
moa = pd.read_pickle('./Drugs/targets.pkl').merge(pd.read_csv('./Drugs/DOE IDs/chembl_to_doeid.csv'))
moa = moa[['chembl_id', 'gene', 'moa']]
rd = rd.merge(moa)[['Code', 'gene', 'moa', 'phase']]
rd = rd.sort_values('phase', ascending=False).drop_duplicates(['Code', 'gene', 'moa'])
rd.to_pickle('./Drugs/fdaeu.pkl')



## Combine

In [None]:
# Inputs: the gene list used as a filter and the two (Code, phase, gene, moa)
# tables written by the cells above.
_source_cols = ['Code', 'phase', 'gene', 'moa']
genes = pd.read_pickle('./OT/Final/protein_coding_genes.pkl')
ot = pd.read_pickle('./Drugs/ot.pkl')[_source_cols]
fdaeu = pd.read_pickle('./Drugs/fdaeu.pkl')[_source_cols]

# Stack both sources; keep rows whose gene is in the gene list and whose phase
# is strictly positive.
df = pd.concat([ot, fdaeu])
df = df.loc[df['gene'].isin(genes['gene']) & (df['phase'] > 0)]

# One row per (Code, gene) with the maximum phase per moa value as columns.
df = df.pivot_table(index=['Code', 'gene'], columns='moa', values='phase', aggfunc='max').reset_index()
# NOTE(review): the relabelling is positional - it assumes the pivot yields
# exactly three moa columns whose order corresponds to neg / other / pos;
# confirm against the moa values stored in targets.pkl.
df.columns = ['Code', 'gene', 'neg', 'other', 'pos']
df.to_pickle('./Drugs/combined.pkl')
