In [7]:
import pandas as pd

## Open Targets data

### Disease to phecode mappings

In [12]:
# Clean the Open Targets molecule table and attach ATC codes.
# ATC codes come first from the DrugBank-keyed pickle; molecules still lacking a
# code are then matched by (lower-cased) name, and failing that by synonym,
# against the ATC vocabulary file.
umls_drugs = pd.read_pickle('./Raw files/drugbank_atc.pkl')


def _first_drugbank_id(refs):
    """Return the first id listed under the 'drugbank' cross-reference, or None."""
    if not refs:
        return None
    for ref in refs:
        if ref and ref[0] == 'drugbank':
            return ref[1][0]
    return None


molecule_columns = ['id','blackBoxWarning','name','drugbank','maximumClinicalTrialPhase',
                    'isApproved','synonyms','crossReferences','description']
drugs = pd.read_parquet("./OT_2024.9/molecule.parquet")
drugs['drugbank'] = drugs['crossReferences'].apply(_first_drugbank_id)
# The join keys are whatever columns the two frames share (presumably 'drugbank' -- verify).
drugs = drugs[molecule_columns].merge(umls_drugs, how='left')

# ATC vocabulary: the code is the tail of 'Class ID' after 'ATC/', the name is lower-cased.
atc = pd.read_csv('./Raw files/ATC.csv')[['Class ID','Preferred Label']]
atc = atc.assign(
    atc=atc['Class ID'].str.split('ATC/').str[-1],
    name=atc['Preferred Label'].str.lower(),
)[['name','atc']].drop_duplicates()

# Molecules without an ATC code: one row per synonym, matched first on the primary name.
empty = (
    drugs[drugs['atc'].isna()]
    .drop(columns='atc')
    .assign(name=lambda frame: frame['name'].str.lower())
    .explode('synonyms')
    .assign(synonyms=lambda frame: frame['synonyms'].str.lower())
    .merge(atc, on='name', how='left')
)
# Where the primary name found nothing, retry the lookup using the synonym as the name.
empty['name'] = empty['name'].where(empty['atc'].notna(), empty['synonyms'])
empty = empty.merge(atc, on='name', how='left')
empty['atc_filled'] = empty['atc_x'].combine_first(empty['atc_y'])
empty = empty.loc[empty['atc_filled'].notna(), ['id','atc_filled']].drop_duplicates()

# Back-fill the recovered codes; a molecule matching several ATC codes yields several rows.
drugs = drugs.merge(empty, on='id', how='left')
drugs['atc'] = drugs['atc'].fillna(drugs['atc_filled'])
drugs = drugs.drop(columns=['atc_filled','synonyms'])

drugs['name'] = drugs['name'].str.lower()
drugs.to_pickle('./Cleaned files/molecule_cleaned.pkl')

In [14]:
# Build the HPO -> phecodeX lookup from two HPO/phecodeX link tables and save it.
# The link tables carry only the numeric phecodeX code, label and category, so each
# (code, label, category) triple has to be resolved to a full phecode from phecodeX_info.
pi = pd.read_excel('./Phecode/phecodeX_info.xls')
# Part of the phecode after the first underscore (the numeric code used by the link tables).
pi['phecodeX_code'] = pi['phecode'].str.split('_').str[1]

ses = pd.read_csv('./Raw files/hpo-phecodeX_linkswithHPOchildrenExpansion_StrongEvidenceSpecific.tsv', sep='\t')
ses['phecodeX_code'] = ses['phecodeX_code'].astype(str)
mp = pd.read_csv('./Raw files/ChildHPO_Precision_X.tsv', sep='\t')
mp['phecodeX_code'] = mp['phecodeX_code'].astype(str)

# Step 1: resolve the phecode by exact label match against phecode_string.
op = pd.concat([ses[['phecodeX_code','phecodeX_label','phecodeX_category']], mp[['phecodeX_code','phecodeX_label','phecodeX_category']]])
op = op.drop_duplicates()
op = op.merge(pi[['phecode','phecode_string']].rename({'phecode_string':'phecodeX_label'},axis=1), how='left')

# Step 2: labels that did not match fall back to a match on the numeric code alone.
up = op.loc[op['phecode'].isna()].drop(['phecode'],axis=1)
op = op.loc[op['phecode'].notna()]
up = up.merge(pi[['phecode','phecodeX_code','phecode_string']], how='inner')

# Numeric codes excluded from the code-only fallback.
# NOTE(review): presumably a hand-curated list of codes whose fallback match is wrong
# or ambiguous -- the selection criterion is not recorded in this notebook.
EXCLUDED_FALLBACK_CODES = ['962.11','324.6','352.2','374.7','375.113','765.3','767.1',
                           '750.12','751.2','757.2','772.1','772.2','978','978.3',
                           '973','975','975.2','705.12','618','582.1','582.2','582.3',
                           '618.1','626.4','374.4','200.11','751.4','168.3','962',
                           '962.1','771','754','752','753.3','977','775','770.4',
                           '770.3','772','771.1','771.3','976','969']
up = up.loc[~up['phecodeX_code'].isin(EXCLUDED_FALLBACK_CODES)]
up = up.drop('phecode_string',axis=1)
op = pd.concat([op,up]).drop_duplicates()
op = op.merge(pi[['phecode','phecode_string']])

link_columns = ['phecode','hpo_code','phecode_string','hpo_label','hpo_child','StrongEvidenceSpecific','StrongEvidenceBroad']
ses = ses.merge(op)[link_columns]
mp = mp.merge(op)[link_columns]
# From the second table keep only links flagged as strong evidence (broad or specific).
mp = mp.loc[(mp['StrongEvidenceBroad'] == True) | (mp['StrongEvidenceSpecific'] == True)]

# Named `hpo_phecode_map` rather than `map` so the built-in `map` is not shadowed
# for the rest of the kernel session.
hpo_phecode_map = pd.concat([ses,mp]).drop_duplicates(['phecode','hpo_code']).rename({'hpo_code':'id'},axis=1)
hpo_phecode_map.to_pickle('./Raw files/hpo_phecodex_map.pkl')

In [None]:
# Map Open Targets disease cross-references (ICD9/ICD10 and HP) to phecodes.
# Only phecodes present in the counts sheet are retained.
inc = pd.read_excel('./Phecode/phecode_counts_v2.xlsx')['phecode'].sort_values().to_list()

# Read ICD codes as text. With type inference a numeric-looking code such as '008'
# or '250.00' can be parsed as a number, and the later astype(str) would then yield
# '8' / '250.0', silently breaking the join on 'code'.
p1 = pd.read_csv('./Phecode/phecodeX_unrolled_ICD_CM.csv', dtype={'ICD': str})
p2 = pd.read_csv('./Phecode/phecodeX_unrolled_ICD_WHO.csv', dtype={'ICD': str})
comb = pd.concat([p1,p2])
comb = comb.loc[comb['phecode'].isin(inc)]
comb = comb.rename({'ICD':'code'},axis=1)
# Collapse the vocabulary id into a coarse ICD family so ICD9 and ICD10 codes never cross-match.
comb.loc[comb['vocabulary_id'].str.contains('ICD9'), 'icd_type'] = 'ICD9'
comb.loc[comb['vocabulary_id'].str.contains('ICD10'), 'icd_type'] = 'ICD10'
comb['code'] = comb['code'].astype(str)

# HPO -> phecode links; 'code' is the part of the id after the underscore.
hp = pd.read_pickle('./Raw files/hpo_phecodex_map.pkl')[['id','phecode']].drop_duplicates()
hp['code'] = hp['id'].str.split('_').str[1].astype(str)
hp = hp[['phecode','code']]
hp['terminology'] = 'HP'
hp = hp.loc[hp['phecode'].isin(inc)]

# One row per disease cross-reference, split into terminology prefix and code at ':'.
disease = pd.read_parquet('./OT_2024.9/diseases.parquet')[['id','dbXRefs','name']]
disease = disease.explode('dbXRefs').dropna(subset='dbXRefs')
disease['terminology'] = disease['dbXRefs'].str.split(':').str[0]
disease['code'] = disease['dbXRefs'].str.split(':').str[1].astype(str)
# Explicit copy so the .loc writes below act on an independent frame, not a column slice.
disease = disease[['id','name','terminology','code']].copy()
disease.loc[disease['terminology'].str.contains('ICD9'), 'icd_type'] = 'ICD9'
disease.loc[disease['terminology'].str.contains('ICD10'), 'icd_type'] = 'ICD10'

# Both joins are left joins, so every cross-reference row appears in both halves of the
# concat; rows without a phecode are kept (phecode is NaN) and filtered by later cells.
disease1 = disease.merge(comb, on=['code','icd_type'], how='left')
disease2 = disease.merge(hp, on=['code','terminology'], how='left')
disease = pd.concat([disease1,disease2]).drop(['icd_type','vocabulary_id'],axis=1)
disease.to_pickle('./Cleaned files/diseases_cleaned.pkl')

In [56]:
# Assemble the master disease-id -> phecode table (full_map.pkl) from:
#   * Open Targets disease ids that received a phecode through an xref,
#   * the MONDO / HP / Orphanet / EFO xrefs of those diseases, re-keyed as '<PREFIX>_<code>',
#   * direct HPO -> phecode links,
#   * an EFO -> phecode v1.2 sheet translated through the v1.2 -> phecodeX crosswalk.


def _xref_phecodes(disease_map, phecode_map, terminology):
    """Re-key the phecodes of diseases by their cross-reference in `terminology`.

    Selects the xref rows of `disease_map` with the given terminology, attaches every
    phecode known for the same disease id, and returns ['id', 'phecode'] where 'id'
    is '<terminology>_<code>'.
    """
    xrefs = disease_map.loc[disease_map['terminology'] == terminology, ['code','id']]
    xrefs = xrefs.merge(phecode_map)
    xrefs['id'] = terminology + '_' + xrefs['code'].astype(str)
    return xrefs[['id','phecode']]


# Named `disease_map` rather than `map` so the built-in `map` is not shadowed.
disease_map = pd.read_pickle('./Cleaned files/diseases_cleaned.pkl')
disease_map['code'] = disease_map['code'].astype(str)
phecode_map = disease_map.loc[disease_map['phecode'].notna()][['id','phecode']]

mondo_map = _xref_phecodes(disease_map, phecode_map, 'MONDO')
hp_map = _xref_phecodes(disease_map, phecode_map, 'HP')
orpha_map = _xref_phecodes(disease_map, phecode_map, 'Orphanet')
efo_map = _xref_phecodes(disease_map, phecode_map, 'EFO')

# Direct HPO links, restricted to phecodes present in the counts sheet.
inc = pd.read_excel('./Phecode/phecode_counts_v2.xlsx')['phecode'].sort_values().to_list()
hp = pd.read_pickle('./Raw files/hpo_phecodex_map.pkl')[['id','phecode']].drop_duplicates()
hp = hp.loc[hp['phecode'].isin(inc)]

# EFO -> phecode v1.2 -> phecodeX; the EFO id is the tail of the URI after 'efo/'.
efo_phecode = pd.read_excel('./Raw files/EFO_phecode.xlsx')\
                .merge(pd.read_excel('./Raw files/Phecode_1.2_X_crosswalk.xlsx'), on='phecode v1.2')\
                [['EFO','phecodeX','phecodeX string']]
efo_phecode['EFO'] = efo_phecode['EFO'].str.split('efo/').str[1]
efo_phecode = efo_phecode[['EFO','phecodeX']].set_axis(['id','phecode'],axis=1).dropna()

full_map = pd.concat([hp, phecode_map, mondo_map, hp_map, orpha_map, efo_map, efo_phecode]).drop_duplicates(['id','phecode'])
full_map = full_map.rename({'id':'disease'},axis=1)
full_map.to_pickle('./Raw files/full_map.pkl')

### Drug indications

In [58]:
indications = pd.read_parquet('./OT_2024.9/indication.parquet')

# One row per (drug, approved indication); approved indications are assigned phase 4.
exploded_indications = (
    indications[['id','approvedIndications']]
    .explode('approvedIndications')
    .dropna()
    .rename(columns={'approvedIndications':'disease'})
    .assign(maxPhaseForIndication=4)
)

def process_row(row):
    """Flatten one indication record into a list of per-disease dicts.

    `row` must provide 'id' and an iterable 'indications' whose items expose
    'disease' and 'maxPhaseForIndication'. Returns one
    {'id', 'disease', 'maxPhaseForIndication'} dict per item, in input order.
    """
    drug_id = row['id']
    return [
        {
            'id': drug_id,
            'disease': entry['disease'],
            'maxPhaseForIndication': entry['maxPhaseForIndication'],
        }
        for entry in row['indications']
    ]
# Flatten the nested per-drug 'indications' records into one row per (drug, disease).
new_rows = []
for _, row in indications.iterrows():
    new_rows.extend(process_row(row))
expanded_df = pd.DataFrame(new_rows)

# Combine trial-derived indications with the approved ones (phase 4).
indications = pd.concat([expanded_df, exploded_indications])
indications = indications.rename({'maxPhaseForIndication':'phase'},axis=1)
# Highest phase first within each (drug, disease), then keep that first row. The
# de-duplication must be keyed on ['id','disease']: de-duplicating on all columns would
# make the sort a no-op and keep one row per distinct phase of the same pair.
indications = indications.sort_values(['id','disease','phase'], ascending=[True,True,False])
indications = indications.drop_duplicates(['id','disease']).reset_index(drop=True)

# Translate disease ids to phecodes (inner join: indications without a phecode are dropped).
full_map = pd.read_pickle('./Raw files/full_map.pkl')
indications = indications.merge(full_map)
indications.to_pickle('./Cleaned files/indication_cleaned.pkl')

In [None]:
# Join drug indications (phecode level) with drug targets and a coarse mechanism class.
drugs = pd.read_pickle('./Cleaned files/indication_cleaned.pkl')
molecules = pd.read_pickle('./Cleaned files/molecule_cleaned.pkl')[['id','name','atc']]
drugs = drugs.merge(molecules, how='left')
# Exclude drugs whose ATC code starts with 'J'; drugs without an ATC code are kept.
in_atc_class_j = drugs['atc'].str.startswith('J', na=False)
drugs = drugs.loc[~in_atc_class_j, ['id','phase','phecode']]

# Action types collapse to 'inhibitor' / 'activator'; anything unlisted becomes 'other'.
action_to_moa = {
    **dict.fromkeys(['INHIBITOR', 'ANTAGONIST', 'BLOCKER', 'NEGATIVE ALLOSTERIC MODULATOR',
                     'ANTISENSE INHIBITOR', 'RELEASING AGENT', 'ALLOSTERIC ANTAGONIST',
                     'INVERSE AGONIST', 'NEGATIVE MODULATOR', 'DEGRADER'], 'inhibitor'),
    **dict.fromkeys(['AGONIST', 'OPENER', 'ACTIVATOR', 'POSITIVE ALLOSTERIC MODULATOR',
                     'POSITIVE MODULATOR', 'PARTIAL AGONIST'], 'activator'),
}
moa = pd.read_parquet('./OT_2024.9/mechanismOfAction.parquet')
moa['moa'] = moa['actionType'].map(action_to_moa).fillna('other')

# One row per (mechanism, drug, target), with the target id resolved to its approved symbol.
target_symbols = pd.read_parquet('./OT_2024.9/targets.parquet')[['id','approvedSymbol']].rename(columns={'id':'targets'})
moa = (
    moa[['moa','chemblIds','targets']]
    .explode('chemblIds')
    .explode('targets')
    .merge(target_symbols)
    .rename(columns={'chemblIds':'id'})
    .drop_duplicates()
)

# Left join from mechanisms: drug-target pairs with no mapped indication keep NaN phase/phecode.
drugs = (
    moa.merge(drugs, how='left')[['id','phase','phecode','moa','approvedSymbol']]
    .rename(columns={'approvedSymbol':'gene'})
)
drugs.to_pickle('./Cleaned files/opentargets.pkl')


### Direct and indirect association scores

In [None]:
# Direct and indirect overall association scores, aggregated to (phecode, gene).
score_columns = ['direct_score','direct_count','indirect_score','indirect_count']

ds = pd.read_parquet('./OT_2024.9/associationByOverallDirect.parquet')
ds.columns = ['disease','targetId','direct_score','direct_count']
ids = pd.read_parquet('./OT_2024.9/associationByOverallIndirect.parquet')
ids.columns = ['disease','targetId','indirect_score','indirect_count']

# Outer join keeps pairs present in only one table; their missing scores/counts become 0.
ass = ds.merge(ids, on=['disease','targetId'], how='outer').fillna(0)

full_map = pd.read_pickle('./Raw files/full_map.pkl')
gene_symbols = pd.read_parquet('./OT_2024.9/targets.parquet')[['id','approvedSymbol']].drop_duplicates()
gene_symbols.columns = ['targetId','gene']

# Several disease ids can share a phecode: keep the maximum of each metric per (phecode, gene).
ass = (
    ass.merge(full_map)
    .merge(gene_symbols)
    .groupby(['phecode','gene'])[score_columns]
    .max()
    .reset_index()
)
ass.to_pickle('./Cleaned files/association_scores_cleaned.pkl')

## Clinical variants

### EVA-ClinVar

In [59]:
# EVA/ClinVar evidence reduced to (gene, variant, phecode, score).
eva = (
    pd.read_parquet('./OT_2024.9/evidence_eva.parquet')
    .rename(columns={'targetId':'target','diseaseId':'disease'})
    .sort_values('score', ascending=False)
)
full_map = pd.read_pickle('./Raw files/full_map.pkl')
# After the descending sort the first row per (target, variant, phecode) is the best-scored one.
eva = (
    eva.merge(full_map)
    .drop_duplicates(['target','variantId','phecode'])
    .explode('clinicalSignificances')
)

retained_significances = ['pathogenic', 'drug response', 'likely pathogenic', 'association',
                          'likely risk allele', 'protective']
has_retained_significance = eva['clinicalSignificances'].isin(retained_significances)
has_assertion_criteria = eva['confidence'] != 'no assertion criteria provided'
eva = eva.loc[has_retained_significance & has_assertion_criteria]

targets = pd.read_parquet('./OT_2024.9/targets.parquet')[['id','approvedSymbol']].drop_duplicates()
eva = eva.merge(targets, left_on='target', right_on='id')
eva = eva[['approvedSymbol','variantId','phecode','score']].rename(
    columns={'approvedSymbol':'gene','variantId':'variant'}
)
# Keep only variant ids that contain an underscore (missing ids are dropped).
eva = eva.loc[eva['variant'].str.contains('_', na=False)]
eva.to_pickle('./Cleaned files/eva_clinvar.pkl')

### Locus2gene

In [None]:
# Locus-to-gene evidence mapped to phecodes, with and without direction of effect.
l2g = pd.read_parquet('./OT_2024.9/evidence_l2g.parquet').rename({'targetId':'target','diseaseId':'disease'},axis=1).sort_values('score',ascending=False)
full_map = pd.read_pickle('./Raw files/full_map.pkl')
# Sorted by score descending, so the best-scored row per (target, variant, phecode) is kept.
l2g = l2g.merge(full_map).drop_duplicates(['target','variantId','phecode'])
targets = pd.read_parquet('./OT_2024.9/targets.parquet')[['id','approvedSymbol']].drop_duplicates()
l2g = l2g.merge(targets, left_on='target', right_on='id').rename({'approvedSymbol':'gene'},axis=1)

# Direction of effect: +1 is written to 'l2g_act', -1 to 'l2g_inh', 0 (unset) to 'l2g_neu'.
#   GoF + protective or LoF + risk       -> +1
#   GoF + risk       or LoF + protective -> -1
l2g['effect'] = 0
l2g.loc[(l2g['variantEffect'] == 'GoF') & (l2g['directionOnTrait'] == 'protect'), 'effect'] = 1
l2g.loc[(l2g['variantEffect'] == 'GoF') & (l2g['directionOnTrait'] == 'risk'), 'effect'] = -1
l2g.loc[(l2g['variantEffect'] == 'LoF') & (l2g['directionOnTrait'] == 'protect'), 'effect'] = -1
l2g.loc[(l2g['variantEffect'] == 'LoF') & (l2g['directionOnTrait'] == 'risk'), 'effect'] = 1

# Best score per (phecode, gene, effect).
l2g = l2g.sort_values(['phecode','gene','effect','score'], ascending=[True,True,True,False]).drop_duplicates(['phecode','gene','effect'])

# Undirected output. Explicit copy: the .loc write below must land on an independent
# frame rather than a column slice of `l2g` (avoids SettingWithCopyWarning).
temp = l2g[['phecode','gene','score']].copy()
# Binary flag for scores above 0.5; rows at or below the threshold keep NaN.
temp.loc[temp['score'] > 0.5, 'l2g'] = 1
temp.rename({'score':'l2g_score'},axis=1).to_pickle('./Cleaned files/l2g.pkl')

# Directed output: one score column per effect class, missing combinations filled with 0.
temp = pd.pivot(l2g,index=['gene','phecode'],columns='effect',values='score').reset_index().rename({-1:'l2g_inh',0:'l2g_neu',1:'l2g_act'},axis=1)
temp.fillna(0).to_pickle('./Cleaned files/l2g_dir.pkl')


### OMIM

In [None]:
# OMIM gene-phenotype pairs mapped to phecodes. An OMIM number reaches a phecode either
#   (a) through UMLS: OMIM code -> CUI -> ICD9/ICD10 code -> phecode, or
#   (b) through OMIM cross-references already resolved in diseases_cleaned.pkl.

# ICD codes are read as text so codes like '008' or '250.00' are not turned into numbers.
p1 = pd.read_csv('./Phecode/phecodeX_unrolled_ICD_CM.csv', dtype={'ICD': str})
p2 = pd.read_csv('./Phecode/phecodeX_unrolled_ICD_WHO.csv', dtype={'ICD': str})
comb = pd.concat([p1,p2]).rename({'ICD':'code'},axis=1)[['phecode','code']]

# MRCONSO columns used: 0 = concept id, 11 = source vocabulary, 13 = code in that source.
# dtype=str avoids the mixed-type inference that makes the code column part number, part
# text (which breaks both the .str filters and the join on code); usecols avoids loading
# the unused columns of a very large file. Column labels stay 0, 11 and 13.
umls = pd.read_csv('./Raw files/2024AB_MRCONSO.RRF', sep='|', header=None, usecols=[0,11,13], dtype=str)

umls_icd = umls.loc[umls[11].str.contains('ICD9|ICD10', na=False)]
umls_icd = umls_icd.merge(comb, left_on=13, right_on='code')[[0,'phecode']].drop_duplicates()

# Plain numeric OMIM entries only: drop rows without a code, 'MTHU...' codes and dotted codes.
umls_omim = umls.loc[umls[11] == 'OMIM'].dropna(subset=[13])
umls_omim = umls_omim.loc[~umls_omim[13].str.contains('MTHU')]
umls_omim = umls_omim.loc[~umls_omim[13].str.contains('.', regex=False)].copy()
umls_omim[13] = umls_omim[13].astype(int)
# Link OMIM numbers to phecodes through the shared concept id.
umls_omim = umls_omim[[0,13]].merge(umls_icd, on=0)
umls_omim = umls_omim[[13,'phecode']].set_axis(['code','phecode'],axis=1)

# Named `omim_map` rather than `map` so the built-in `map` is not shadowed.
omim_map = pd.read_pickle('./Cleaned files/diseases_cleaned.pkl')
omim_map = omim_map.loc[omim_map['terminology'].str.contains('OMIM')]
omim_map = omim_map.loc[omim_map['phecode'].notna(), ['code','phecode']].copy()
omim_map['code'] = omim_map['code'].astype(int)
omim_map = pd.concat([omim_map,umls_omim]).drop_duplicates()

# genemap2: one ';'-separated phenotype list per gene; the MIM number is the run of
# digits that follows a comma and precedes an opening parenthesis.
df = pd.read_csv('./Raw files/genemap2.txt', sep='\t', skiprows=3)[['Approved Gene Symbol','Phenotypes']].dropna()
df['Phenotypes'] = df['Phenotypes'].str.split(';')
df = df.explode('Phenotypes')
df['MIM'] = df['Phenotypes'].str.extract(r',\s*(\d+)\s*\(', expand=False)
df['MIM'] = pd.to_numeric(df['MIM'], errors='coerce')
# Phenotypes without a MIM number are dropped here.
df = df[['Approved Gene Symbol','MIM']].set_axis(['Gene','MIM'],axis=1).dropna()
df['MIM'] = df['MIM'].astype(int)
df = df.rename({'MIM':'code'},axis=1)
df = df.merge(omim_map)
df = df[['Gene','phecode']].drop_duplicates()
df.to_pickle('./Cleaned files/omim.pkl')

  umls = pd.read_csv('./2024AB_MRCONSO.RRF', sep='|', header=None)[[0,11,13]]


### HGMD

In [62]:
# HGMD accession -> phecode, via the HPO parent/root terms listed for each accession.
hid = pd.read_csv('./Raw files/HGMD_HPO_Parent_and_Root_2023_4_Mort.csv')
hid['parentAndRoot_HPO_codes'] = hid['parentAndRoot_HPO_codes'].str.split(',')
hid = (
    hid.explode('parentAndRoot_HPO_codes')
    # 'HP:0001250' -> '1250': strip the prefix and the zero padding so both sides compare equal.
    .assign(code=lambda frame: frame['parentAndRoot_HPO_codes']
            .str.replace('HP:','').str.lstrip('0').astype(str))
    [['acc_num','code']]
)

# Same normalisation for the HPO -> phecode links ('HP_0001250' -> '1250').
hp = (
    pd.read_pickle('./Raw files/hpo_phecodex_map.pkl')[['phecode','id']]
    .assign(code=lambda frame: frame['id']
            .str.replace('HP_', '').str.lstrip('0').astype(int).astype(str))
    [['code','phecode']]
    .drop_duplicates()
)
hid = hid.merge(hp).drop_duplicates().rename(columns={'acc_num':'ID'})

df = pd.read_csv('./Raw files/2023_4_hg38_fullinfo.tsv', sep='\t')
def parse_info(info_string):
    """Parse a ';'-separated 'KEY=VALUE' string into a dict.

    Fields without '=' are ignored, each field is split at its first '=' only,
    surrounding double quotes are stripped from values, and a repeated key keeps
    its last value.
    """
    fields = (field.partition('=') for field in info_string.split(';') if '=' in field)
    return {key: value.strip('"') for key, _, value in fields}
# Expand the INFO field into columns, keep the variant descriptors, and attach phecodes.
parsed_info = df['INFO'].map(parse_info)
df_parsed = pd.DataFrame(list(parsed_info))

variant_columns = ['CHROM','POS','REF','ALT','DNA','ID','CLASS','GENE']
df = df.join(df_parsed).loc[:, variant_columns]

# Inner join on the shared column(s) -- 'ID' is the HGMD accession on both sides.
hgmd = df.merge(hid)
hgmd.to_pickle('./Cleaned files/hgmd.pkl')

  df = pd.read_csv('./Raw files/2023_4_hg38_fullinfo.tsv', sep='\t')


### Combine

In [None]:
# LoGoFunc predictions
# Variant ids predicted as loss-of-function / gain-of-function; they are compared below
# with 'chrom-pos-ref-alt' style keys built from the EVA and HGMD tables.

dv = pd.read_pickle('./Raw files/logofunc_preds.pkl')
lof_list = dv.loc[dv['Predicted_Label'] == 'LOF']['ID'].drop_duplicates().to_list()
gof_list = dv.loc[dv['Predicted_Label'] == 'GOF']['ID'].drop_duplicates().to_list()

# Without DOE
# Per (gene, phecode): variant counts and presence flags for EVA, HGMD and OMIM.

eva = pd.read_pickle('./Cleaned files/eva_clinvar.pkl').drop_duplicates()
# NOTE(review): .count() runs over every remaining column, so besides 'EVA_count' the
# result also carries a 'score' column that holds a count, not a score -- confirm intended.
eva = eva.groupby(['gene','phecode']).count().reset_index().rename({'variant':'EVA_count'},axis=1)
eva['EVA'] = 1

hgmd = pd.read_pickle('./Cleaned files/hgmd.pkl').drop_duplicates()
hgmd = hgmd.groupby(['GENE','phecode'])['DNA'].count().reset_index().rename({'DNA':'HGMD_count','GENE':'gene'},axis=1)
hgmd['HGMD'] = 1

omim = pd.read_pickle('./Cleaned files/omim.pkl').drop_duplicates().rename({'Gene':'gene'},axis=1)
omim['OMIM'] = 1

# Outer joins: a pair present in any one source is kept; absent sources are NaN.
cv = eva.merge(hgmd, on=['gene','phecode'], how='outer').merge(omim, on=['gene','phecode'], how='outer')
cv.to_pickle('./Cleaned files/cv.pkl')

# With DOE
# Per (phecode, gene): number of variants in each direction-of-effect class.

eva = pd.read_pickle('./Cleaned files/eva_clinvar.pkl').drop_duplicates()
# Convert the underscore-separated variant id to the dash-separated LoGoFunc key.
eva['LOGO'] = eva['variant'].str.replace('_','-')

hgmd = pd.read_pickle('./Cleaned files/hgmd.pkl').drop_duplicates()
# NOTE(review): assumes CHROM uses the same chromosome naming as the LoGoFunc ids -- verify.
hgmd['LOGO'] = hgmd['CHROM'].astype(str) + '-' + hgmd['POS'].astype(str) + '-' + hgmd['REF'] + '-' + hgmd['ALT']

# Predicted LOF variants are counted as '_act', predicted GOF variants as '_inh',
# and variants in neither list as '_neu'.
eva.loc[eva['LOGO'].isin(lof_list), 'EVA_act'] = 1
eva.loc[eva['LOGO'].isin(gof_list), 'EVA_inh'] = 1
eva.loc[(eva['EVA_act'].isna()) & (eva['EVA_inh'].isna()), 'EVA_neu'] = 1
eva = eva.drop_duplicates(['gene','phecode','variant'])
# sum() skips NaN, so each column becomes the number of flagged variants (0 when none).
eva = eva.groupby(['phecode','gene'])[['EVA_act','EVA_inh','EVA_neu']].sum().reset_index()

hgmd.loc[hgmd['LOGO'].isin(lof_list), 'HGMD_act'] = 1
hgmd.loc[hgmd['LOGO'].isin(gof_list), 'HGMD_inh'] = 1
hgmd.loc[(hgmd['HGMD_act'].isna()) & (hgmd['HGMD_inh'].isna()), 'HGMD_neu'] = 1
hgmd = hgmd.drop_duplicates(['GENE','phecode','DNA']).rename({'GENE':'gene'},axis=1)
hgmd = hgmd.groupby(['phecode','gene'])[['HGMD_act','HGMD_inh','HGMD_neu']].sum().reset_index()

cv_dir = eva.merge(hgmd, on=['phecode','gene'], how='outer')
cv_dir.to_pickle('./Cleaned files/cv_dir.pkl')

## SIDER data

### Drug mechanisms

In [None]:
# Build a (CID, Gene, moa, source) drug-target table from four sources -- DrugBank,
# Open Targets, the NDR list and ChEMBL -- all keyed on the CID found in `dc`.

# Action types shared by the ChEMBL, DrugBank and Open Targets branches.
INHIBITOR_ACTIONS = ['INHIBITOR', 'ANTAGONIST', 'BLOCKER', 'NEGATIVE ALLOSTERIC MODULATOR',
                     'ANTISENSE INHIBITOR', 'RELEASING AGENT', 'ALLOSTERIC ANTAGONIST',
                     'INVERSE AGONIST', 'NEGATIVE MODULATOR', 'DEGRADER']
ACTIVATOR_ACTIONS = ['AGONIST', 'OPENER', 'ACTIVATOR', 'POSITIVE ALLOSTERIC MODULATOR',
                     'POSITIVE MODULATOR', 'PARTIAL AGONIST']
ACTION_TO_MOA = {**dict.fromkeys(INHIBITOR_ACTIONS, 'inhibitor'),
                 **dict.fromkeys(ACTIVATOR_ACTIONS, 'activator')}


def _classify_action(actions):
    """Map a Series of upper-case action types to 'inhibitor' / 'activator' / 'other'.

    Values not in either list, including missing values, become 'other'.
    """
    return actions.map(ACTION_TO_MOA).fillna('other')


# Drug identifier conversion table; the columns used below are name, CHEMBL, Drugbank, CID.
dc = pd.read_pickle('./Raw files/drug_conv.pkl')

# --- NDR list: the mechanism is free text, so it is classified by substring matching ---

ndr = pd.read_excel('./Raw files/ndr_list.xlsx')
# Explicit copy so the column writes below act on an independent frame.
ndr = ndr.loc[ndr['ORGANISM'] == 'Homo sapiens'].copy()
ndr['moa'] = 'other'
# Order matters: later rules overwrite earlier ones.
ndr.loc[ndr['MECHANISM_OF_ACTION'].str.contains('inhibitor|antagonist|blocker|negative allo|antisense|releasing|inverse agonist|negative modulator|degrader', na=False), 'moa'] = 'inhibitor'
# 'agonist' is a substring of 'antagonist', hence the explicit exclusion of inverse/antagonist.
ndr.loc[(~ndr['MECHANISM_OF_ACTION'].str.contains('inverse|antagonist', na=False)) & (ndr['MECHANISM_OF_ACTION'].str.contains('agonist', na=False)), 'moa'] = 'activator'
ndr.loc[ndr['MECHANISM_OF_ACTION'].str.contains('opener|activator|positive allo|positive modulator|partial agonist', na=False), 'moa'] = 'activator'

# HGNC id -> approved symbol (one row per HGNC id after sorting on Status).
hgnc = pd.read_csv('./Raw files/hgnc_genes.txt', sep='\t')
hgnc = hgnc.sort_values(['HGNC ID','Status']).drop_duplicates('HGNC ID')
hgnc = hgnc[['HGNC ID','Approved symbol']].rename({'Approved symbol':'Gene'},axis=1)
conv = pd.read_csv('./Raw files/ndr_hgnc.tsv', sep='\t').set_axis(['ACCESSION','HGNC ID'],axis=1)
conv = conv.merge(hgnc)[['ACCESSION', 'Gene']]

ndr = ndr.merge(conv)
ndr = ndr[['PARENT_PREF_NAME','Gene','moa']].set_axis(['Name','Gene','moa'],axis=1)
ndr['Name'] = ndr['Name'].str.lower()
ndr = ndr.merge(dc[['name','CID']].dropna(), left_on='Name', right_on='name')
ndr = ndr[['CID','Gene','moa']].drop_duplicates()
ndr['source'] = 'NDR'

# --- ChEMBL drug targets: human, gene-symbol synonyms only ---

ct = pd.read_csv('./Raw files/chembl_35_drug_targets.csv')
ct = ct.loc[ct['organism'] == 'Homo sapiens']
ct = ct.loc[ct['syn_type'] == 'GENE_SYMBOL'].copy()
ct['moa'] = _classify_action(ct['action_type'])
ct = ct[['pref_name','chembl_id','component_synonym','moa']].set_axis(['Name','CHEMBL','Gene','moa'],axis=1)
ct['Name'] = ct['Name'].str.lower()
# Resolve the CID by ChEMBL id and, independently, by drug name; keep both sets of matches.
ct = pd.concat([ct.merge(dc[['CHEMBL','CID']].dropna()),ct.merge(dc[['name','CID']].dropna(), left_on='Name', right_on='name')])
ct['Gene'] = ct['Gene'].str.replace('Synonyms=','')
ct = ct[['CID','Gene','moa']].drop_duplicates()
ct['source'] = 'ChEMBL'

# --- DrugBank pharmacologically active targets ---

dpa = pd.read_csv('./Raw files/drugbank_pharm_active_5.1.13.csv')
dpa = dpa.loc[dpa['Species'] == 'Humans']
dpa = dpa.merge(hgnc)
dpa = dpa.assign(Drugbank=dpa['Drug IDs'].str.split(';')).explode('Drugbank')
dpa = dpa[['Drugbank','Gene','UniProt ID','Species']].drop_duplicates()
dpa = dpa[['Drugbank','Gene']].merge(dc[['Drugbank','CID']].dropna())[['Drugbank','CID','Gene']].drop_duplicates()
db_moa = pd.read_pickle('./Raw files/drugbank_moa_5.1.13.pkl')
# NOTE(review): the join key is 'Drugbank' only, so every action recorded for a drug is
# attached to every one of its genes -- confirm db_moa has no per-target column to join on.
dpa = dpa.merge(db_moa[['Drugbank','Action']])
dpa['Action'] = dpa['Action'].str.upper()
dpa['moa'] = _classify_action(dpa['Action'])
dpa = dpa[['CID','Gene','moa']]
dpa['source'] = 'DrugBank'

# --- Open Targets mechanisms of action ---

moa = pd.read_parquet('./OT_2024.9/mechanismOfAction.parquet')
moa['moa'] = _classify_action(moa['actionType'])
moa = moa[['moa','chemblIds','targets']]
moa = moa.explode('chemblIds')
moa = moa.explode('targets')
moa = moa.merge(pd.read_parquet('./OT_2024.9/targets.parquet')[['id','approvedSymbol']].rename({'id':'targets'},axis=1))
moa = moa.rename({'chemblIds':'id'},axis=1).drop_duplicates()
moa = moa[['id','approvedSymbol','moa']].dropna().drop_duplicates().set_axis(['CHEMBL','Gene','moa'],axis=1)
# Use only conversion rows that actually carry a CID, as the other three sources do;
# merging the full `dc` would let targets with a missing CID into the table.
moa = dc[['CHEMBL','CID']].dropna().merge(moa, on=['CHEMBL'])[['CID','Gene','moa']]
moa['source'] = 'Open Targets'

# --- Combine ---
# Concat order sets the precedence when the same (CID, Gene) occurs in several sources:
# DrugBank, then Open Targets, then NDR, then ChEMBL (first occurrence wins).
targets = pd.concat([dpa,moa,ndr,ct]).drop_duplicates(['CID','Gene'])
targets.to_pickle('./Raw files/drug_targets.pkl')

### MedDRA to phecode mappings

In [None]:
# Extracting phecode indications
# Drug indications (file column 0 = drug id, columns 1 and 5 = two UMLS concept ids for
# the indication) are mapped to phecodes through UMLS -> ICD9/ICD10 and UMLS -> HPO links.

# MRCONSO columns used: 0 = concept id, 11 = source vocabulary, 13 = code in that source.
# dtype=str keeps codes as text (no mixed number/text column); usecols skips unused columns.
umls = pd.read_csv('./Raw files/2024AB_MRCONSO.RRF', sep='|', header=None, usecols=[0,11,13], dtype=str)
si = pd.read_csv('./Raw files/meddra_all_indications.tsv', sep='\t', header=None)
# Where the two concept ids differ, add the second one as an extra (drug, concept) row.
# Built by selection + rename rather than by assigning into a filtered slice.
temp = si.loc[si[1] != si[5], [0, 5]].rename(columns={5: 1})
si = pd.concat([si[[0,1]], temp]).drop_duplicates()

# ICD -> phecode, restricted to phecodes in the counts sheet; ICD codes are read as text.

inc = pd.read_excel('./Phecode/phecode_counts_v2.xlsx')['phecode'].sort_values().to_list()
p1 = pd.read_csv('./Phecode/phecodeX_unrolled_ICD_CM.csv', dtype={'ICD': str})
p2 = pd.read_csv('./Phecode/phecodeX_unrolled_ICD_WHO.csv', dtype={'ICD': str})
comb = pd.concat([p1,p2])
comb = comb.loc[comb['phecode'].isin(inc)]
comb = comb.rename({'ICD':'code'},axis=1)[['phecode','code']]

umls_icd = umls.loc[umls[11].str.contains('ICD9|ICD10', na=False)]
umls_icd = umls_icd.merge(comb, left_on=13, right_on='code')[[0,'phecode']].drop_duplicates()

# HPO -> phecode; both sides normalise the HPO id to its integer value.
umls_hpo = umls.loc[umls[11] == 'HPO'].copy()
umls_hpo[13] = umls_hpo[13].str.replace('HP:','').str.lstrip('0').astype(int)
hp = pd.read_pickle('./Raw files/hpo_phecodex_map.pkl')[['phecode','id']]
hp = hp.assign(code=hp['id'].str.replace('HP_', '').str.lstrip('0').astype(int))
hp = hp[['code','phecode']]
hp = hp.loc[hp['phecode'].isin(inc)]
umls_hpo = umls_hpo.merge(hp, left_on=13, right_on='code')[[0,'phecode']].drop_duplicates()

umls_map = pd.concat([umls_hpo,umls_icd]).drop_duplicates()

# Both frames have a column labelled 0, so the merge suffixes them: '0_x' is the drug id
# from `si`, '0_y' the concept id from `umls_map`.

si = si.merge(umls_map, left_on=1, right_on=0)[['0_x','phecode']].set_axis(['CID','phecode'],axis=1)
si.to_pickle('./Raw files/drug_phecode.pkl')

### Combining files

In [None]:
# Attach phecode indications to every drug-target pair; pairs without one keep NaN.
drug_targets = pd.read_pickle('./Raw files/drug_targets.pkl')
drug_phecodes = pd.read_pickle('./Raw files/drug_phecode.pkl')
sider = drug_targets.merge(drug_phecodes, how='left').drop_duplicates()
sider = sider.drop(columns='source')
sider.columns = ['id','gene','moa','phecode']

# Remove drugs whose ATC code (second column of the file) starts with 'J'.
drug_atc = pd.read_csv('./Raw files/drug_atc.tsv', sep='\t', header=None)
drug_atc = drug_atc.loc[drug_atc[1].str.startswith('J')]
excluded_ids = drug_atc[0]
sider = sider.loc[~sider['id'].isin(excluded_ids)]

sider.to_pickle('./Cleaned files/sider.pkl')

## Genetic associations

In [None]:
# Binary encoding (significant or not)
# Output: one row per (phecode, gene), one 0/1-style column per '<type>_<cat>' combination.

apd = pd.read_pickle('./Raw files/apd.pkl')

# Genome rows need an eQTL gene; exome rows must be missense or PTV; other rows pass.
genome_ok = (apd['cat'] != 'genome') | apd['eqtl_gene'].notna()
exome_ok = (apd['cat'] != 'exome') | apd['FUNCTION'].isin(['missense','PTV'])
apd = apd.loc[genome_ok & exome_ok].reset_index(drop=True)

# Gene assignment: the eQTL gene, falling back to GENE for every category except 'genome'.
apd['gene'] = apd['eqtl_gene'].where(apd['cat'] == 'genome', apd['eqtl_gene'].fillna(apd['GENE']))

key_columns = ['phecode','cat','type','gene']
apd = apd.drop_duplicates(key_columns)[key_columns]
apd = apd.assign(
    value=1,
    type=apd['type'].map({'phecode':'p','continuous':'c','binary':'b'}),
    cat=apd['cat'].map({'exome':'e','genome':'g','gb':'b'}),
)
apd['col'] = apd['type'] + '_' + apd['cat']
apd = apd.pivot_table(index=['phecode','gene'], columns='col', values='value').reset_index()
apd.to_pickle('./Cleaned files/apd.pkl')

In [None]:
# Continuous -log10(p-value) encoding
# Output: one row per (phecode, gene), one '<type>_<cat>_z' column holding the largest LOG10P.

apdz = pd.read_pickle('./Raw files/apd_z.pkl')

# Genome rows need an eQTL gene; exome rows must be missense or PTV; other rows pass.
genome_ok = (apdz['cat'] != 'genome') | apdz['eqtl_gene'].notna()
exome_ok = (apdz['cat'] != 'exome') | apdz['FUNCTION'].isin(['missense','PTV'])
apdz = apdz.loc[genome_ok & exome_ok].reset_index(drop=True)

# Gene assignment: the eQTL gene, falling back to GENE for every category except 'genome'.
apdz['gene'] = apdz['eqtl_gene'].where(apdz['cat'] == 'genome', apdz['eqtl_gene'].fillna(apdz['GENE']))

# Most significant row per (phecode, cat, type, gene).
key_columns = ['phecode','cat','type','gene']
apdz = apdz.sort_values('LOG10P', ascending=False).drop_duplicates(key_columns)
apdz = apdz[key_columns + ['LOG10P']]
apdz = apdz.assign(
    type=apdz['type'].map({'phecode':'p','continuous':'c','binary':'b'}),
    cat=apdz['cat'].map({'exome':'e','genome':'g','gb':'b'}),
)
apdz['col'] = apdz['type'] + '_' + apdz['cat'] + '_z'
apdz = apdz.pivot_table(index=['phecode','gene'], columns='col', values='LOG10P').reset_index()
apdz.to_pickle('./Cleaned files/apdz.pkl')

In [None]:
# Continuous -log10(p-value) encoding with DOE
# Output: one row per (phecode, gene), one '<type>_<cat>_<act|inh|neu>' column holding the
# largest LOG10P seen for that direction of effect; missing combinations are 0.

apdz = pd.read_pickle('./Raw files/apd_z.pkl')
# Exome rows are replaced by the pre-annotated DOE file loaded below.
apdz = apdz.loc[apdz['cat'] != 'exome']
apdz = apdz.loc[((apdz['cat'] == 'genome') & (apdz['eqtl_gene'].notna())) | (apdz['cat'] != 'genome')]

# Here we took the most significant variant for each of activator, inhibitor, or neutral.
# We used LoGoFunc to generate GOF and LOF predictions for the exome variants; that file
# already carries the resulting 'effect' column.
# GOF & beta > 0 = inhibitor
# GOF & beta < 0 = activator
# LOF & beta > 0 = activator
# LOF & beta < 0 = inhibitor

# Path fixed to be relative to the notebook like every other input: a leading '/' would
# resolve from the filesystem root.
apdz_exome = pd.read_pickle('./Raw files/apd_z_exome_DOE.pkl')
apdz = pd.concat([apdz,apdz_exome])
apdz = apdz[['phecode','cat','type','ID','BETA','LOG10P','GENE','FUNCTION','eqtl_gene','slope','effect']]

# Gene assignment: the eQTL gene, falling back to GENE for every category except 'genome'.
apdz['gene'] = apdz['eqtl_gene'].copy()
apdz = apdz.reset_index(drop=True)
apdz.loc[apdz['cat'] != 'genome', 'gene'] = apdz['gene'].fillna(apdz['GENE'])

# 'gb' rows: positive beta -> +1 (activator), negative beta -> -1 (inhibitor).
# NOTE(review): this treats 'gb' signals like the LOF case above -- confirm.
apdz.loc[(apdz['cat'] == 'gb') & (apdz['BETA'] > 0), 'effect'] = 1
apdz.loc[(apdz['cat'] == 'gb') & (apdz['BETA'] < 0), 'effect'] = -1

# Rows with an eQTL slope: slope and beta of opposite sign -> +1, same sign -> -1.
apdz.loc[(apdz['slope'] < 0) & (apdz['BETA'] > 0), 'effect'] = 1
apdz.loc[(apdz['slope'] > 0) & (apdz['BETA'] < 0), 'effect'] = 1
apdz.loc[(apdz['slope'] > 0) & (apdz['BETA'] > 0), 'effect'] = -1
apdz.loc[(apdz['slope'] < 0) & (apdz['BETA'] < 0), 'effect'] = -1

# Anything still unset is neutral.
apdz['effect'] = apdz['effect'].fillna(0)

# Most significant row per (phecode, cat, type, gene, effect).
apdz = apdz.sort_values(['LOG10P'], ascending=False).drop_duplicates(['phecode','cat','type','gene','effect'])
apdz = apdz[['phecode','cat','type','gene','effect','LOG10P']].dropna()

apdz['type'] = apdz['type'].map({'phecode':'p','continuous':'c','binary':'b'})
apdz['cat'] = apdz['cat'].map({'exome':'e','genome':'g','gb':'b'})
apdz['effect'] = apdz['effect'].map({1:'act',-1:'inh',0:'neu'})
apdz['col'] = apdz['type'] + '_' + apdz['cat'] + '_' + apdz['effect']

apdz = pd.pivot_table(apdz, index=['phecode','gene'], columns='col', values='LOG10P').reset_index()
# NOTE(review): written to './Final/' whereas the other outputs go to './Cleaned files/' -- confirm.
apdz.fillna(0).to_pickle('./Final/apdz_dir.pkl')