In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)

In [None]:
# SNOMED and MRCONSO need to be downloaded from NIH website

snomed = pd.read_csv('./Resources/der2_iisssccRefset_ExtendedMapFull_US1000124_20250901.txt', sep='\t')
snomed = snomed.loc[~snomed['mapTarget'].isna()].sort_values('referencedComponentId')
snomed = snomed[['referencedComponentId','mapGroup','mapPriority','mapRule','mapAdvice','mapTarget']]
snomed = snomed.loc[snomed['mapRule'] == 'TRUE']
snomed.to_pickle('./Resources/snomed_20250901.pkl')

###

pd.read_csv('./Resources/MRCONSO.RRF', sep='|', header=None)[[0,11,13,14]].to_pickle('./Resources/2025AB_MRCONSO.pkl')

###

# ! rm -rf ./Resources/der2_iisssccRefset_ExtendedMapFull_US1000124_20250901.txt
# ! rm -rf ./Resources/MRCONSO.RRF

In [2]:
pi = pd.read_csv('./Resources/phecodeX_info.csv')
pi['phecodeX_code'] = pi['phecode'].str.split('_').str[1]

ses = pd.read_csv('./Resources/hpo-phecodeX_linkswithHPOchildrenExpansion_StrongEvidenceSpecific.tsv', sep='\t')
ses['phecodeX_code'] = ses['phecodeX_code'].astype(str)
mp = pd.read_csv('./Resources/ChildHPO_Precision_X.tsv', sep='\t')
mp['phecodeX_code'] = mp['phecodeX_code'].astype(str)

op = pd.concat([ses[['phecodeX_code','phecodeX_label','phecodeX_category']], mp[['phecodeX_code','phecodeX_label','phecodeX_category']]])
op = op.drop_duplicates()
op = op.merge(pi[['phecode','phecode_string']].rename({'phecode_string':'phecodeX_label'},axis=1), how='left')
up = op.loc[op['phecode'].isna()].drop(['phecode'],axis=1)
op = op.loc[op['phecode'].notna()]
up = up.merge(pi[['phecode','phecodeX_code','phecode_string']], how='inner')
up = up.loc[~up['phecodeX_code'].isin(['962.11','324.6','352.2','374.7','375.113','765.3','767.1',
                                       '750.12','751.2','757.2','772.1','772.2','978','978.3',
                                       '973','975','975.2','705.12','618','582.1','582.2','582.3',
                                       '618.1','626.4','374.4','200.11','751.4','168.3','962',
                                       '962.1','771','754','752','753.3','977','775','770.4',
                                       '770.3','772','771.1','771.3','976','969'])]
up = up.drop('phecode_string',axis=1)
op = pd.concat([op,up]).drop_duplicates()
op = op.merge(pi[['phecode','phecode_string']])

ses = ses.merge(op)[['phecode','hpo_code','phecode_string','hpo_label','hpo_child','StrongEvidenceSpecific','StrongEvidenceBroad']]
mp = mp.merge(op)[['phecode','hpo_code','phecode_string','hpo_label','hpo_child','StrongEvidenceSpecific','StrongEvidenceBroad']]
mp = mp.loc[(mp['StrongEvidenceBroad'] == True) | (mp['StrongEvidenceSpecific'] == True)]
map = pd.concat([ses,mp]).drop_duplicates(['phecode','hpo_code']).rename({'hpo_code':'id'},axis=1)
map.to_pickle('./Resources/hpo_phecodex_map.pkl')

## Cleaning raw files

In [3]:
def extract_drugbank(refs):
    if not isinstance(refs, (list, tuple)):
        return None
    for d in refs:
        if isinstance(d, dict) and str(d.get('source', '')).lower() == 'drugbank':
            ids = d.get('ids') or d.get('id') or d.get('identifier') or d.get('value')
            return ids[0] if isinstance(ids, (list, tuple)) else ids
    return None

drugs = pd.read_pickle("./OT_Raw/molecule.pkl")
drugs["drugbank"] = drugs["crossReferences"].apply(extract_drugbank)

drugs = drugs[[
    "id","blackBoxWarning","name","drugbank",
    "maximumClinicalTrialPhase","isApproved",
    "synonyms","crossReferences","description"
]]
drugs["name"] = drugs["name"].str.lower()
drugs.to_pickle("./OT_Final/molecule_cleaned.pkl")

In [7]:
p1 = pd.read_csv('./Resources/phecodeX_unrolled_ICD_CM.csv')
p2 = pd.read_csv('./Resources/phecodeX_unrolled_ICD_WHO.csv')
comb = pd.concat([p1,p2])
comb = comb.rename({'ICD':'code'},axis=1)
comb.loc[comb['vocabulary_id'].str.contains('ICD9'), 'icd_type'] = 'ICD9'
comb.loc[comb['vocabulary_id'].str.contains('ICD10'), 'icd_type'] = 'ICD10'
comb['code'] = comb['code'].astype(str)
comb_null = comb.copy()
comb_null['code'] = comb_null['code'].str.replace('.','', regex=False)

hp = pd.read_pickle('./Resources/hpo_phecodex_map.pkl')[['id','phecode']].drop_duplicates()
hp['code'] = hp['id'].str.split('_').str[1].astype(str)
hp = hp[['phecode','code']]
hp['terminology'] = 'HP'

umls = pd.read_pickle('./Resources/2025AB_MRCONSO.pkl')[[0,11,13]]
umls_icd = umls.loc[umls[11].str.contains('ICD9|ICD10')]
umls_icd = umls_icd.merge(comb, left_on=13, right_on='code')[[0,'phecode']].drop_duplicates()
umls_other = umls.loc[~umls[11].str.contains('ICD9|ICD10')]
umls_other = umls_other.merge(umls_icd, on=0)
umls_other = umls_other[[13,11,'phecode']].rename({13:'code',11:'umls_terminology'},axis=1).drop_duplicates()
umls_icd = umls_icd.set_axis(['code','phecode'],axis=1)
umls_icd['umls_terminology'] = 'UMLS'
umls = pd.concat([umls_icd,umls_other])

snomed = pd.read_pickle('./Resources/snomed_20250901.pkl')
snomed = snomed.merge(comb[['phecode','code']], left_on='mapTarget', right_on='code')
snomed = snomed[['referencedComponentId','phecode']].drop_duplicates().set_axis(['code','phecode'],axis=1)
snomed['code'] = snomed['code'].astype(str)
snomed['terminology'] = 'SNOMEDCT'

disease = pd.read_pickle("./OT_Raw/diseases.pkl")[['id','dbXRefs','name']]
disease = disease.explode('dbXRefs').dropna(subset='dbXRefs')
disease['terminology'] = disease['dbXRefs'].str.split(':').str[0]
disease['code'] = disease['dbXRefs'].str.split(':').str[1].astype(str)
disease = disease[['id','name','terminology','code']]
disease.loc[disease['terminology'].str.contains('ICD9'), 'icd_type'] = 'ICD9'
disease.loc[disease['terminology'].str.contains('ICD10'), 'icd_type'] = 'ICD10'

disease1 = disease.merge(comb, on=['code','icd_type'], how='left')
disease2 = disease.merge(hp, on=['code','terminology'], how='left')
disease3 = disease.merge(snomed, on=['code','terminology'], how='left')

disease_umls = disease.copy()
disease_umls = disease_umls.merge(umls)
check = pd.read_excel('./Resources/umls_equivalence.xlsx').dropna()
disease_umls = disease_umls.merge(check)
disease_umls = disease_umls[['id','name','terminology','code','phecode']]

temp = disease1.loc[(disease1['phecode'].isna()) & (disease1['icd_type'].notna())].drop(['phecode','vocabulary_id'],axis=1)
temp['code'] = temp['code'].str.replace('*','').str.replace('+','')
temp = temp.merge(comb, on=['code','icd_type'], how='left')
disease1 = pd.concat([disease1, temp])

temp = temp.loc[(temp['phecode'].isna()) & (temp['icd_type'].notna())].drop(['phecode','vocabulary_id'],axis=1)
temp = temp.loc[~temp['code'].str.contains('-')]
temp['code'] = temp['code'].str.replace('.','')
temp = temp.merge(comb_null, on=['code','icd_type'], how='left')
disease1 = pd.concat([disease1, temp])

temp = temp.loc[(temp['phecode'].isna()) & (temp['icd_type'].notna())].drop(['phecode','vocabulary_id'],axis=1)
temp['code'] = temp['code'].str[:-1]
temp = temp.merge(comb_null, on=['code','icd_type'], how='left')
disease1 = pd.concat([disease1, temp])

disease = pd.concat([disease1,disease2,disease3,disease_umls]).drop(['icd_type','vocabulary_id'],axis=1).drop_duplicates()
disease.to_pickle('./OT_Final/diseases_cleaned.pkl')


In [10]:
indications = pd.read_pickle("./OT_Raw/indication.pkl")

exploded_indications = indications[['id','approvedIndications']].explode('approvedIndications').dropna()
exploded_indications = exploded_indications.rename({'approvedIndications':'disease'},axis=1)
exploded_indications['maxPhaseForIndication'] = 4

# Function to process each row and extract the desired information
def process_row(row):
    id_value = row['id']
    result_rows = []
    for indication in row['indications']:
        disease = indication['disease']
        max_phase = indication['maxPhaseForIndication']
        result_rows.append({'id': id_value, 'disease': disease, 'maxPhaseForIndication': max_phase})
    return result_rows

# Apply the function to each row and create a list of new rows
new_rows = []
for _, row in indications.iterrows():
    new_rows.extend(process_row(row))

# Create a new DataFrame from the list of new rows
expanded_df = pd.DataFrame(new_rows)

indications = pd.concat([expanded_df, exploded_indications])
indications = indications.rename({'maxPhaseForIndication':'phase'},axis=1)
indications = indications.sort_values(['id','disease','phase'], ascending=[True,True,False])
indications = indications.drop_duplicates().reset_index(drop=True)

map = pd.read_pickle('./OT_Final/diseases_cleaned.pkl')
map['code'] = map['code'].astype(str)

phecode_map = map.loc[map['phecode'].notna()][['id','phecode']]

mondo_map = map.loc[map['terminology'] == 'MONDO'][['code','id']]
mondo_map = mondo_map.merge(phecode_map)
mondo_map['id'] = 'MONDO_' + mondo_map['code'].astype(str)
mondo_map = mondo_map[['id','phecode']]

hp_map = map.loc[map['terminology'] == 'HP'][['code','id']]
hp_map = hp_map.merge(phecode_map)
hp_map['id'] = 'HP_' + hp_map['code'].astype(str)
hp_map = hp_map[['id','phecode']]

orpha_map = map.loc[map['terminology'] == 'Orphanet'][['code','id']]
orpha_map = orpha_map.merge(phecode_map)
orpha_map['id'] = 'Orphanet_' + orpha_map['code'].astype(str)
orpha_map = orpha_map[['id','phecode']]

efo_map = map.loc[map['terminology'] == 'EFO'][['code','id']]
efo_map = efo_map.merge(phecode_map)
efo_map['id'] = 'EFO_' + efo_map['code'].astype(str)
efo_map = efo_map[['id','phecode']]

hp = pd.read_pickle('./Resources/hpo_phecodex_map.pkl')[['id','phecode']].drop_duplicates()

eva_map = pd.concat([hp, phecode_map, mondo_map, hp_map, orpha_map, efo_map]).drop_duplicates(['id','phecode'])
eva_map = eva_map.rename({'id':'disease'},axis=1)

indications = indications.merge(eva_map)
indications = indications.groupby(['id','disease','phecode'])['phase'].max().reset_index()
indications.to_pickle('./OT_Final/indication_cleaned.pkl')


## HGMD

In [9]:
# Mapping HGMD

hid = pd.read_csv('./Resources/HGMD_HPO_Parent_and_Root_2023_4_Mort.csv')
hid = hid.assign(parentAndRoot_HPO_codes=hid['parentAndRoot_HPO_codes'].str.split(',')).explode('parentAndRoot_HPO_codes')
hid['code'] = hid['parentAndRoot_HPO_codes'].str.replace('HP:','').str.lstrip('0')
hid = hid[['acc_num','code']]
hid['code'] = hid['code'].astype(str)

#####

hp = pd.read_pickle('./Resources/hpo_phecodex_map.pkl')[['phecode','id']]
hp['code'] = hp['id'].str.replace('HP_', '').str.lstrip('0').astype(int).astype(str)
hp = hp[['code','phecode']].drop_duplicates()

hid = hid.merge(hp).drop_duplicates().rename({'acc_num':'ID'},axis=1)

#####

df = pd.read_csv('./Resources/2023_4_hg38_fullinfo.tsv', sep='\t')
def parse_info(info_string):
    info_dict = {}
    for item in info_string.split(';'):
        if '=' in item:
            key, value = item.split('=', 1)
            info_dict[key] = value.strip('"')
    return info_dict
parsed_info = df['INFO'].apply(parse_info)
df_parsed = pd.DataFrame(parsed_info.tolist())

df = df.join(df_parsed)[['CHROM','POS','REF','ALT','DNA','ID','CLASS','GENE']]

#####

hgmd = df.merge(hid)
hgmd.to_pickle('./OT_Final/hgmd.pkl')

  df = pd.read_csv('./Resources/2023_4_hg38_fullinfo.tsv', sep='\t')


## OMIM

In [10]:
p1 = pd.read_csv('./Resources/phecodeX_unrolled_ICD_CM.csv')
p2 = pd.read_csv('./Resources/phecodeX_unrolled_ICD_WHO.csv')
comb = pd.concat([p1,p2])
comb = comb.rename({'ICD':'code'},axis=1)
comb.loc[comb['vocabulary_id'].str.contains('ICD9'), 'icd_type'] = 'ICD9'
comb.loc[comb['vocabulary_id'].str.contains('ICD10'), 'icd_type'] = 'ICD10'
comb = comb[['phecode','code']]

umls = pd.read_pickle('./Resources/2025AB_MRCONSO.pkl')[[0,11,13]]

umls_icd = umls.loc[umls[11].str.contains('ICD9|ICD10')]
umls_icd = umls_icd.merge(comb, left_on=13, right_on='code')[[0,'phecode']].drop_duplicates()

umls_omim = umls.loc[umls[11] == 'OMIM']
umls_omim = umls_omim.loc[~umls_omim[13].str.contains('MTHU')]
umls_omim = umls_omim.loc[~umls_omim[13].str.contains('.', regex=False)]
umls_omim[13] = umls_omim[13].astype(int)
umls_omim = umls_omim[[0,13]].merge(umls_icd, on=0)
umls_omim = umls_omim[[13,'phecode']].set_axis(['code','phecode'],axis=1)

map = pd.read_pickle('./OT_Final/diseases_cleaned.pkl')
map = map.loc[map['terminology'].str.contains('OMIM')]
map = map.loc[map['phecode'].notna()][['code','phecode']]
map['code'] = map['code'].astype(int)
map = pd.concat([map,umls_omim]).drop_duplicates()

df = pd.read_csv('./Resources/genemap2_20251114.txt', sep='\t', skiprows=3)[['Approved Gene Symbol','Phenotypes']].dropna()
df['Phenotypes'] = df['Phenotypes'].str.split(';')
df = df.explode('Phenotypes')
df = df.loc[df['Phenotypes'].str.contains('(3)')]

df['MIM'] = df['Phenotypes'].str.extract(r',\s*(\d+)\s*\(')
df['MIM'] = pd.to_numeric(df['MIM'], errors='coerce')
df = df[['Approved Gene Symbol','MIM']].set_axis(['Gene','MIM'],axis=1).dropna()
df['MIM'] = df['MIM'].astype(int)
df = df.rename({'MIM':'code'},axis=1)
df = df.merge(map)
df = df[['Gene','phecode']].drop_duplicates()
df.to_pickle('./OT_Final/omim.pkl')

  df = df.loc[df['Phenotypes'].str.contains('(3)')]


## OT Evidence

In [3]:
map = pd.read_pickle('./OT_Final/diseases_cleaned.pkl')
map['code'] = map['code'].astype(str)

phecode_map = map.loc[map['phecode'].notna()][['id','phecode']]

mondo_map = map.loc[map['terminology'] == 'MONDO'][['code','id']]
mondo_map = mondo_map.merge(phecode_map)
mondo_map['id'] = 'MONDO_' + mondo_map['code'].astype(str)
mondo_map = mondo_map[['id','phecode']]

hp_map = map.loc[map['terminology'] == 'HP'][['code','id']]
hp_map = hp_map.merge(phecode_map)
hp_map['id'] = 'HP_' + hp_map['code'].astype(str)
hp_map = hp_map[['id','phecode']]

orpha_map = map.loc[map['terminology'] == 'Orphanet'][['code','id']]
orpha_map = orpha_map.merge(phecode_map)
orpha_map['id'] = 'Orphanet_' + orpha_map['code'].astype(str)
orpha_map = orpha_map[['id','phecode']]

efo_map = map.loc[map['terminology'] == 'EFO'][['code','id']]
efo_map = efo_map.merge(phecode_map)
efo_map['id'] = 'EFO_' + efo_map['code'].astype(str)
efo_map = efo_map[['id','phecode']]

hp = pd.read_pickle('./Resources/hpo_phecodex_map.pkl')[['phecode','id']]

eva_map = pd.concat([phecode_map, mondo_map, hp_map, orpha_map, efo_map, hp]).drop_duplicates(['id','phecode'])
eva_map = eva_map.rename({'id':'diseaseId'},axis=1)

#####

target = pd.read_pickle('./OT_Raw/targets.pkl')[['id','approvedSymbol']]
target = target.set_axis(['targetId','gene'],axis=1)


In [12]:
dbs = pd.read_pickle('./OT_Raw/direct.pkl')
dbs = dbs.merge(target)
dbs = dbs.merge(eva_map)
dbs = dbs.sort_values('score',ascending=False).drop_duplicates(['phecode','gene','datasourceId','score','evidenceCount'])
dbs = pd.pivot_table(dbs, index=['phecode','gene'], columns='datasourceId', values='score').reset_index()
dbs.to_pickle('./OT_Final/direct_by_source.pkl')

# Not used
if False:
    dbs = pd.read_pickle('./OT_Raw/indirect.pkl')
    dbs = dbs.merge(target)
    dbs = dbs.merge(eva_map)
    dbs = dbs.sort_values('score',ascending=False).drop_duplicates(['phecode','gene','datasourceId','score','evidenceCount'])
    dbs = pd.pivot_table(dbs, index=['phecode','gene'], columns='datasourceId', values='score').reset_index()
    dbs.to_pickle('./OT_Final/indirect_by_source.pkl')


In [13]:
chembl = pd.read_pickle('./OT_Raw/chembl.pkl')
chembl = chembl.merge(target).merge(eva_map)[['datasourceId','phecode','gene','clinicalPhase',
                                              'drugId','studyStartDate','studyStopReasonCategories','score','directionOnTrait']]
chembl = chembl.rename({'clinicalPhase':'phase','drugId':'chembl_id'},axis=1)
chembl.to_pickle('./OT_Final/chembl.pkl')


In [7]:
chembl = pd.read_pickle('./OT_Raw/chembl.pkl')
chembl = chembl.merge(target).merge(eva_map)
chembl['id'] = chembl['phecode'] + ':' + chembl['gene']
chembl['active'] = chembl['clinicalStatus'].map({'Completed':0,'Terminated':0,'Recruiting':1,'Unknown status':0,
                             'Active, not recruiting':1,'Withdrawn':0,'Not yet recruiting':1,
                             'Suspended':0,'Enrolling by invitation':1,'None':0})

chembl = chembl[['drugId','id','clinicalPhase','clinicalStatus','studyId','active','studyStartDate','studyStopReasonCategories']]
chembl = chembl.set_axis(['chembl_id','id','phase','status','study_id','active','start_date','stop_reason'],axis=1)
chembl['start_date'] = pd.to_datetime(chembl['start_date'])
chembl.loc[chembl['status'].astype(str).str.contains('Completed'),'Negative'] = 2
chembl.loc[chembl['stop_reason'].astype(str).str.contains('Negative'),'Negative'] = 1
chembl.loc[chembl['status'].astype(str).str.contains('Completed'),'Safety'] = 2
chembl.loc[chembl['stop_reason'].astype(str).str.contains('Safety'),'Safety'] = 1
chembl['active'] = chembl['active'].fillna(0)

mk = pd.read_pickle('./Resources/minikel_drugs_cleaned.pkl')
mk['id'] = mk['phecode'] + ':' + mk['gene']
mk['phase'] = mk['phase'].map({'Preclinical':0.5,'Phase I':1,'Phase II':2,'Phase III':3,'Launched':4})
mk['active'] = 0
mk.loc[mk['active_max_phase'].notna(), 'active'] = 1
mk = mk[['id','phase','active','succ_p_1','succ_1_2','succ_2_3','succ_3_a']]
mk = mk.set_axis(['id','max_phase','active','p_1','1_2','2_3','3_a'],axis=1)

#####

temp = chembl[['chembl_id','id','phase','study_id','status','active','start_date']]
temp = pd.concat([temp, mk[['id','max_phase','active']].rename({'max_phase':'phase'},axis=1)])
temp = temp.reset_index(drop=True).drop_duplicates()
temp.to_pickle('./Resources/all_trials.pkl')

#####

ed = chembl.groupby('id')['start_date'].min().reset_index()
ed.to_pickle('./Resources/earliest_date.pkl')

#####

ps = chembl.groupby(['id'])[['phase','active']].max().reset_index()
ps = ps.rename({'phase':'max_phase'},axis=1)
ps = ps.merge(chembl.groupby(['id','phase'])['start_date'].min().reset_index().rename({'phase':'max_phase'},axis=1), how='left')

ps.loc[ps['max_phase'] >= 1, 'p_1'] = 1
ps.loc[(ps['max_phase'] == 0.5) & (ps['active'] == 0), 'p_1'] = 0

ps.loc[ps['max_phase'] >= 2, '1_2'] = 1
ps.loc[(ps['max_phase'] == 1) & (ps['active'] == 0), '1_2'] = 0

ps.loc[ps['max_phase'] >= 3, '2_3'] = 1
ps.loc[(ps['max_phase'] == 2) & (ps['active'] == 0), '2_3'] = 0

ps.loc[ps['max_phase'] >= 4, '3_a'] = 1
ps.loc[(ps['max_phase'] == 3) & (ps['active'] == 0), '3_a'] = 0

ps.loc[ps['max_phase'] >= 4, '1_a'] = 1
ps.loc[(ps['max_phase'].isin([1,2,3])) & (ps['active'] == 0), '1_a'] = 0

ps = pd.concat([ps,mk]).groupby('id')[['max_phase','p_1','1_2','2_3','3_a','1_a']].max().reset_index()
ps.to_pickle('./Resources/phase_success.pkl')
