In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import itertools

###

# Target table stored under ./OT/Final.
targets = pd.read_pickle('./OT/Final/targets.pkl')

# Phenotype tables; "Diseases.ipynb" documents these files.
# The 'phecode1.2' column is cast to float as part of the load.
custom = pd.read_excel('./Phenotyping/custom_phenotypes.xlsx').astype({'phecode1.2': float})
icd = pd.read_csv('./Phenotyping/icd_codes.csv')
merged_pheno = pd.read_excel('./Phenotyping/merged_phenotypes.xlsx')

## Phenotype-agnostic

In [None]:
# Druggable genes only
# Builds one row per drug-target gene: mechanism-of-action indicator columns
# joined with the gene-level feature tables, written to gene_direction.pkl.
genes = pd.read_pickle('./OT/Final/protein_coding_genes.pkl')
# Drop rows whose gene label contains the substring 'ENSG'
# (presumably symbols that are really raw Ensembl IDs - TODO confirm).
genes = genes.loc[~genes['gene'].str.contains('ENSG')]

tg = pd.read_pickle('./Drugs/targets.pkl')[['gene','moa']]
# .assign() returns a new frame, so the helper column is never written onto a
# filtered slice of the pickle (the original pattern triggers SettingWithCopyWarning).
tg = tg.loc[tg['gene'].isin(genes['gene'])].assign(value=1)
# Wide table: 1.0 where a (gene, moa) pair exists, 0 where it does not.
tg = pd.pivot_table(tg, index='gene', columns='moa', values='value').fillna(0).reset_index()
# NOTE(review): positional relabel - assumes the pivot yields exactly three moa
# columns whose sorted order corresponds to neg / other / pos. Verify against
# the moa labels in targets.pkl.
tg = tg.set_axis(['gene','neg','other','pos'],axis=1)

# Gene-level feature tables; 'tdl' is dropped from tp before joining.
tp = pd.read_pickle('./Features/tp.pkl').drop('tdl',axis=1)
constraint = pd.read_pickle('./Features/constraint.pkl')
ess = pd.read_pickle('./Features/ess_haplo.pkl')
tg = tg.merge(tp, on='gene', how='left').merge(constraint, on='gene', how='left').merge(ess, on='gene', how='left')
# The left joins can fan out if a feature table repeats a gene; keep the first row per gene.
tg = tg.drop_duplicates('gene')
tg.to_pickle('./OT/Final/gene_direction.pkl')


In [None]:
# All genes
# Builds one row per gene with druggability flags, mechanism-of-action
# indicators and gene-level features, written to gene_direction_all.pkl.
genes = pd.read_pickle('./OT/Final/protein_coding_genes.pkl')
# Drop rows whose gene label contains the substring 'ENSG'.
genes = genes.loc[~genes['gene'].str.contains('ENSG')]
genes = genes.merge(pd.read_pickle('./Drugs/drug_genes.pkl'), how='left')
genes = genes.merge(pd.read_csv('./Other/pharos.csv')[['Symbol','Target Development Level']].set_axis(['gene','tdl'],axis=1), how='left')
# .copy() detaches the de-duplicated frame so the column writes below do not
# hit a slice (avoids SettingWithCopyWarning).
genes = genes.drop_duplicates('gene').copy()
# Genes without a Pharos development level are labelled 'Tdark'.
genes['tdl'] = genes['tdl'].fillna('Tdark')

# 0/1 flags derived from one boolean mask instead of init-to-0 then overwrite.
is_tclin_tchem = genes['tdl'].isin(['Tchem','Tclin'])
genes['tclin_tchem'] = is_tclin_tchem.astype('int64')
# Druggable = has a non-null 'source' (presumably contributed by drug_genes.pkl -
# TODO confirm) or is Tchem/Tclin.
genes['druggable'] = (genes['source'].notna() | is_tclin_tchem).astype('int64')

# .assign() avoids writing the helper column onto a column-subset slice.
tg = pd.read_pickle('./Drugs/targets.pkl')[['gene','moa']].assign(value=1)
# Wide table: 1.0 where a (gene, moa) pair exists, 0 where it does not.
tg = pd.pivot_table(tg, index='gene', columns='moa', values='value').fillna(0).reset_index()
# NOTE(review): positional relabel - assumes exactly three moa columns whose
# sorted order corresponds to neg / other / pos. Verify against targets.pkl.
tg = tg.set_axis(['gene','neg','other','pos'],axis=1)
genes = genes.merge(tg, how='left')
# Genes absent from the target table get 0 for every mechanism indicator.
genes[['neg','pos','other']] = genes[['neg','pos','other']].fillna(0)

# Gene-level feature tables; 'tdl' is dropped from tp so it does not clash
# with the Pharos-derived column above.
tp = pd.read_pickle('./Features/tp.pkl').drop('tdl',axis=1)
constraint = pd.read_pickle('./Features/constraint.pkl')
ess = pd.read_pickle('./Features/ess_haplo.pkl')
genes = genes.merge(tp, on='gene', how='left').merge(constraint, on='gene', how='left').merge(ess, on='gene', how='left')
# The left joins can fan out if a feature table repeats a gene; keep the first row per gene.
genes = genes.drop_duplicates('gene')
genes.to_pickle('./OT/Final/gene_direction_all.pkl')


In [None]:
# Subset analyses
# Flags gene subsets (constraint, novelty) and saves them next to the class_* columns.
subsets = pd.read_pickle('./OT/Final/gene_direction_all.pkl')

# Split on the 0.6 cut-off of 'lof.oe_ci.upper'; rows on the other side of the
# cut-off (or with a missing value) stay NaN in the respective flag column.
oe_upper = subsets['lof.oe_ci.upper']
subsets.loc[oe_upper >= 0.6, 'not_constrained'] = 1
subsets.loc[oe_upper < 0.6, 'constrained'] = 1

# NOTE(review): this is an inner join, so genes without a Pharos row are dropped
# here, and repeated Pharos symbols would repeat gene rows - confirm both are intended.
novelty = pd.read_csv('./Other/pharos.csv')[['Symbol','Novelty']].rename(columns={'Symbol': 'gene', 'Novelty': 'novelty'})
subsets = subsets.merge(novelty)

# 'novel' marks genes at or above the median novelty of the joined table.
median_novelty = subsets['novelty'].median()
subsets.loc[subsets['novelty'] >= median_novelty, 'novel'] = 1
print(median_novelty)

subset_columns = ['gene','class_enzyme','class_transporter','class_tf','class_gpcr','class_vgic','constrained','not_constrained','novel']
subsets = subsets[subset_columns]
subsets.to_pickle('./OT/Final/subsets.pkl')


## Phenotype-specific

### Only druggable genes

In [None]:
# Load all dataframes
# Base (Code, gene) pairs; every feature table below is left-joined onto these rows.
df0 = pd.read_pickle('./Drugs/combined.pkl')[['Code','gene']]

# Feature tables, each expected to carry 'Code' and 'gene' join keys.
# NOTE(review): the numbering skips df7 - confirm no feature table is missing.
df1 = pd.read_pickle('./Features/finngen_eqtl.pkl')
df2 = pd.read_pickle('./Features/l2g.pkl')
df3 = pd.read_pickle('./Features/mvp_eqtl.pkl')
df4 = pd.read_pickle('./Features/panukbb_eqtl.pkl')
df5 = pd.read_pickle('./Features/finngen_sv.pkl')
df6 = pd.read_pickle('./Features/genebass_sv.pkl')
df8 = pd.read_pickle('./Features/finngen_gb.pkl')
df9 = pd.read_pickle('./Features/genebass_gb.pkl')
df10 = pd.read_pickle('./Features/jurgens.pkl')
df11 = pd.read_pickle('./Features/ot_gb.pkl')
df12 = pd.read_pickle('./Features/eva.pkl')
df13 = pd.read_pickle('./Features/orphanet_g2p.pkl')
df14 = pd.read_pickle('./Features/impc.pkl')
df15 = pd.read_pickle('./Features/finngen_eqtl_closest.pkl')
df16 = pd.read_pickle('./Features/mvp_eqtl_closest.pkl')
df17 = pd.read_pickle('./Features/panukbb_eqtl_closest.pkl')

dfs = [
    df0,
    df1, df2, df3, df4,
    df5, df6, df8, df9, df10,
    df11, df12, df13, df14,
    df15, df16, df17,
]

# Fold the feature tables onto the base pairs one at a time; the left join
# keeps every base row even when a table has nothing for it.
result = df0
for df in dfs[1:]:
    result = pd.merge(result, df, how="left", on=["Code", "gene"])

# Swap +/-infinity for the finite sentinels +/-325
# (presumably a cap for statistics that overflowed - TODO confirm).
result = result.replace([np.inf, -np.inf], [325, -325])
result.to_pickle('./Training/input.pkl')


In [None]:
# Keep only screened phenotypes
keep = pd.read_excel('./Phenotyping/nonspecific_codes.xlsx')
# .copy() so the fillna below writes to an independent frame instead of a
# filtered slice of the spreadsheet (avoids SettingWithCopyWarning).
keep = keep.loc[keep['Include'] == 'y'].copy()
keep['Nonspecific'] = keep['Nonspecific'].fillna(0)
db = pd.read_excel('./Phenotyping/present_pheno.xlsx')
# NOTE: `input` shadows the Python builtin; the name is left unchanged so any
# other cell that reads `input` keeps working.
input = pd.read_pickle('./Training/input.pkl')
# Inner joins: rows survive only for phenotypes present in `db` and marked Include == 'y'.
input = input.merge(db).merge(keep[['Code','Category','Nonspecific']].drop_duplicates())
input['Category'] = pd.Categorical(input['Category'])

# Where pheno present and value missing, fill with 0
for col in ['eva_hclof', 'eva_lclof', 'eva_gof', 'impc_lof', 'l2g_activator', 'l2g_inhibitor', 
            'ot_gene_burden', 'orphanet_g2p_lof', 'eva_other_missense']:
    input.loc[(input['OT'] == 1) & (input[col].isna()), col] = 0
# Same rule per cohort: in rows where the cohort flag is 1, every column whose
# name contains the cohort prefix has its missing values set to 0.
for flag, prefix in [('MVP', 'mvp_'),
                     ('FinnGen', 'finngen_'),
                     ('Jurgens', 'jurgens_'),
                     ('PanUKBB', 'panukbb_'),
                     ('Genebass', 'genebass_')]:
    present = input[flag] == 1
    cohort_cols = input.columns.str.contains(prefix)
    input.loc[present, cohort_cols] = input.loc[present, cohort_cols].fillna(0)
input = input.drop(['OT','Jurgens','MVP','PanUKBB','Genebass','FinnGen'],axis=1)

# Collapse each pair of directional scores into one signed feature: start from
# the first ("positive") column and, where the second is strictly larger,
# store the second with a minus sign. Rows where the comparison is False
# (including NaN comparisons) keep the first column's value.
for signed, positive_col, negative_col in [
        ('locus2gene', 'l2g_activator', 'l2g_inhibitor'),
        ('panukbb_eqtl', 'panukbb_eqtl_closest_opposite', 'panukbb_eqtl_closest_same'),
        ('finngen_eqtl', 'finngen_eqtl_closest_opposite', 'finngen_eqtl_closest_same'),
        ('mvp_eqtl', 'mvp_eqtl_closest_opposite', 'mvp_eqtl_closest_same')]:
    input[signed] = input[positive_col].copy()
    input.loc[input[negative_col] > input[positive_col], signed] = -1*input[negative_col]

constraint = pd.read_pickle('./Features/constraint.pkl')[['gene','lof.oe_ci.upper_bin_decile']]
input = input.merge(constraint, on='gene', how='left')

# NOTE(review): `input` already carries 'Nonspecific' from `keep`, so this inner
# join matches on both 'Code' and 'Nonspecific' and looks redundant - confirm
# before removing.
ns = pd.read_excel('./Phenotyping/nonspecific_codes.xlsx')[['Code','Nonspecific']].fillna(0)
input = input.merge(ns)
# One row per (phenotype, gene) pair; the first occurrence wins.
input = input.drop_duplicates(['Code','gene'])
input.to_pickle('./Training/input_extended.pkl')


### All supported genes

In [None]:
# Screened phenotypes (Include == 'y') and gene labels without 'ENSG'.
keep = pd.read_excel('./Phenotyping/nonspecific_codes.xlsx')
keep = keep[keep['Include'] == 'y']
genes = pd.read_pickle('./OT/Final/protein_coding_genes.pkl')
genes = genes[~genes['gene'].str.contains('ENSG')]

# Every phenotype paired with every gene (full Cartesian product).
# NOTE(review): this materialises len(keep) * len(genes) Python tuples in memory.
combinations = list(itertools.product(keep['Code'], genes['gene']))
df0 = pd.DataFrame(combinations, columns=['Code', 'gene'])

# Feature tables, each expected to carry 'Code' and 'gene' join keys.
# NOTE(review): the numbering skips df7 - confirm no feature table is missing.
df1 = pd.read_pickle('./Features/finngen_eqtl.pkl')
df2 = pd.read_pickle('./Features/l2g.pkl')
df3 = pd.read_pickle('./Features/mvp_eqtl.pkl')
df4 = pd.read_pickle('./Features/panukbb_eqtl.pkl')
df5 = pd.read_pickle('./Features/finngen_sv.pkl')
df6 = pd.read_pickle('./Features/genebass_sv.pkl')
df8 = pd.read_pickle('./Features/finngen_gb.pkl')
df9 = pd.read_pickle('./Features/genebass_gb.pkl')
df10 = pd.read_pickle('./Features/jurgens.pkl')
df11 = pd.read_pickle('./Features/ot_gb.pkl')
df12 = pd.read_pickle('./Features/eva.pkl')
df13 = pd.read_pickle('./Features/orphanet_g2p.pkl')
df14 = pd.read_pickle('./Features/impc.pkl')
df15 = pd.read_pickle('./Features/finngen_eqtl_closest.pkl')
df16 = pd.read_pickle('./Features/mvp_eqtl_closest.pkl')
df17 = pd.read_pickle('./Features/panukbb_eqtl_closest.pkl')

dfs = [
    df0,
    df1, df2, df3, df4,
    df5, df6, df8, df9, df10,
    df11, df12, df13, df14,
    df15, df16, df17,
]

# Fold the feature tables onto the pair grid one at a time; the left join
# keeps every (Code, gene) pair even when a table has nothing for it.
result = df0
for df in dfs[1:]:
    result = pd.merge(result, df, how="left", on=["Code", "gene"])

# Swap +/-infinity for the finite sentinels +/-325
# (presumably a cap for statistics that overflowed - TODO confirm).
result = result.replace([np.inf, -np.inf], [325, -325])
result.to_pickle('./Training/input_all.pkl')


In [None]:
# Keep only screened phenotypes
keep = pd.read_excel('./Phenotyping/nonspecific_codes.xlsx')
# .copy() so the fillna below writes to an independent frame instead of a
# filtered slice of the spreadsheet (avoids SettingWithCopyWarning).
keep = keep.loc[keep['Include'] == 'y'].copy()
keep['Nonspecific'] = keep['Nonspecific'].fillna(0)
db = pd.read_excel('./Phenotyping/present_pheno.xlsx')
# NOTE: `input` shadows the Python builtin; the name is left unchanged so any
# other cell that reads `input` keeps working.
input = pd.read_pickle('./Training/input_all.pkl')
# Inner joins: rows survive only for phenotypes present in `db` and marked Include == 'y'.
input = input.merge(db).merge(keep[['Code','Category','Nonspecific']].drop_duplicates())
input['Category'] = pd.Categorical(input['Category'])

# Where pheno present and value missing, fill with 0
for col in ['eva_hclof', 'eva_lclof', 'eva_gof', 'impc_lof', 'l2g_activator', 'l2g_inhibitor', 
            'ot_gene_burden', 'orphanet_g2p_lof', 'eva_other_missense']:
    input.loc[(input['OT'] == 1) & (input[col].isna()), col] = 0
# Same rule per cohort: in rows where the cohort flag is 1, every column whose
# name contains the cohort prefix has its missing values set to 0.
for flag, prefix in [('MVP', 'mvp_'),
                     ('FinnGen', 'finngen_'),
                     ('Jurgens', 'jurgens_'),
                     ('PanUKBB', 'panukbb_'),
                     ('Genebass', 'genebass_')]:
    present = input[flag] == 1
    cohort_cols = input.columns.str.contains(prefix)
    input.loc[present, cohort_cols] = input.loc[present, cohort_cols].fillna(0)
input = input.drop(['OT','Jurgens','MVP','PanUKBB','Genebass','FinnGen'],axis=1)

# Collapse each pair of directional scores into one signed feature: start from
# the first ("positive") column and, where the second is strictly larger,
# store the second with a minus sign. Rows where the comparison is False
# (including NaN comparisons) keep the first column's value.
for signed, positive_col, negative_col in [
        ('locus2gene', 'l2g_activator', 'l2g_inhibitor'),
        ('panukbb_eqtl', 'panukbb_eqtl_closest_opposite', 'panukbb_eqtl_closest_same'),
        ('finngen_eqtl', 'finngen_eqtl_closest_opposite', 'finngen_eqtl_closest_same'),
        ('mvp_eqtl', 'mvp_eqtl_closest_opposite', 'mvp_eqtl_closest_same')]:
    input[signed] = input[positive_col].copy()
    input.loc[input[negative_col] > input[positive_col], signed] = -1*input[negative_col]

constraint = pd.read_pickle('./Features/constraint.pkl')[['gene','lof.oe_ci.upper_bin_decile']]
input = input.merge(constraint, on='gene', how='left')

# NOTE(review): `input` already carries 'Nonspecific' from `keep`, so this inner
# join matches on both 'Code' and 'Nonspecific' and looks redundant - confirm
# before removing.
ns = pd.read_excel('./Phenotyping/nonspecific_codes.xlsx')[['Code','Nonspecific']].fillna(0)
input = input.merge(ns)
# One row per (phenotype, gene) pair; the first occurrence wins.
input = input.drop_duplicates(['Code','gene'])

# Feature-column groups. Only eqtl, raresv, gb and otp_unique feed the
# 'sources' count below; `target` is defined but not used in this cell.
raresv = ['genebass_sv_hclof', 'genebass_sv_lclof', 'genebass_sv_lof_0.5missense','genebass_sv_lof_missense',
          'genebass_sv_gof_0.5missense', 'genebass_sv_gof_missense', 'genebass_sv_hcgof',
          'finngen_sv_hclof', 'finngen_sv_lclof','finngen_sv_gof',
          'genebass_sv_other_0.5missense', 'genebass_sv_other_missense', 'finngen_sv_other_0.5missense', 'finngen_sv_other_missense']
eqtl = ['l2g_activator', 'l2g_inhibitor',
         'panukbb_eqtl', 'finngen_eqtl', 'mvp_eqtl']
gb = ['genebass_gb_hclof_missense', 'genebass_gb_hclof','genebass_gb_hclof_lclof_missense',
      'jurgens_lof', 'jurgens_lof_0.5missense', 'jurgens_lof_0.8missense','jurgens_0.2missense', 'jurgens_0.5missense',
      'finngen_gb_lof']
otp_unique = ['eva_hclof', 'eva_lclof', 'eva_gof', 'impc_lof']
target = ['lof.oe_ci.upper_bin_decile','Category', 'Nonspecific']

# Per row: number of listed feature columns whose absolute value exceeds 0.001
# (NaN compares False, so missing values do not count).
input['sources'] = (input[eqtl+raresv+gb+otp_unique].abs() > 0.001).sum(axis=1)
input.to_pickle('./Training/input_extended_all.pkl')


In [None]:
# Only rows with Mantis-ML or OT support
# Both evidence tables are keyed by a "Code|gene" string so they can be
# compared against the feature table with a single membership test.
mantis = (
    pd.read_pickle('./Other/mantis_cleaned.pkl')
    .assign(ID=lambda frame: frame['Code'] + '|' + frame['Gene'])
    [['ID','Mantis']]
    .drop_duplicates()
)
da = (
    pd.read_pickle('./OT/Final/direct_associations.pkl')
    .assign(ID=lambda frame: frame['Code'] + '|' + frame['gene'])
    [['ID','score']]
    .drop_duplicates()
    .rename(columns={'score': 'OT'})
)

# Pairs passing either threshold: Mantis > 0.5 or OT score > 0.1.
mantis_ids = mantis.loc[mantis['Mantis'] > 0.5, 'ID'].to_list()
ot_ids = da.loc[da['OT'] > 0.1, 'ID'].to_list()
evidence = mantis_ids + ot_ids

# `input` is the frame left in memory by the cell that writes
# input_extended_all.pkl; that cell must have been run first.
input['ID'] = input['Code'] + '|' + input['gene']
has_evidence = input['ID'].isin(evidence)
has_source = input['sources'] > 0
# Keep pairs with external evidence AND at least one non-zero feature source.
input = input.loc[has_evidence & has_source]
input.to_pickle('./Training/input_extended_all_filtered.pkl')
