In [None]:
import pandas as pd

# Import datasets


def _load_indications(path, included_phecodes, source):
    """Load one drug-indication CSV and reduce it to one row per gene-phecode pair.

    Parameters
    ----------
    path : str
        CSV file to read; the code uses its 'gene', 'phecode', 'indication'
        and 'phase' columns.
    included_phecodes : iterable
        Phecodes to keep; rows whose 'phecode' is not in it are discarded.
    source : str
        Prefix for the output column names, e.g. 'SIDER' gives
        'SIDER indication' and 'SIDER phase'.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The filtered, de-duplicated frame with all of its columns plus 'g-p',
        and the three-column lookup table ('g-p', '<source> indication',
        '<source> phase') derived from it.
    """
    frame = pd.read_csv(path)
    # Explicit copy: the filtered result gets a new column below, and writing
    # into the output of a boolean .loc selection triggers
    # SettingWithCopyWarning on pandas < 3.
    frame = frame.loc[frame['phecode'].isin(included_phecodes)].copy()
    # Join key: gene and phecode concatenated with no separator.
    # NOTE(review): assumes both columns hold strings - confirm.
    frame['g-p'] = frame['gene'] + frame['phecode']
    # Sorting by descending phase first means drop_duplicates (keep='first')
    # retains a highest-phase row for each gene-phecode pair.
    frame = frame.sort_values('phase', ascending=False).drop_duplicates('g-p')
    table = frame[['g-p', 'indication', 'phase']].rename(
        {'indication': source + ' indication', 'phase': source + ' phase'}, axis=1)
    return frame, table


# Phecodes flagged include == 'Y' in the phecode count workbook (a Series).
phecodes = pd.read_excel('./Phecode/phecode_counts_v2.xlsx')
phecodes = phecodes.loc[phecodes['include'] == 'Y', 'phecode']

sdu, st = _load_indications('./Datasets/sider_input.csv', phecodes, 'SIDER')
otu, ot = _load_indications('./Datasets/ot_input.csv', phecodes, 'Open Targets')

# Write ML-GPS and ML-GPS DOE predictions

# Load the predictions, order them by descending 'pcb' and replace missing
# values with 0.
pred = pd.read_pickle('./Outputs/pred.pkl').sort_values('pcb', ascending=False).reset_index(drop=True)
pred = pred.fillna(0)

# Re-read the phecode workbook, this time keeping the description column as
# well; the inner merge restricts the predictions to phecodes flagged
# include == 'Y'. The later feature sections reuse this `phecodes` frame.
phecodes = pd.read_excel('./Phecode/phecode_counts_v2.xlsx')
phecodes = phecodes.loc[phecodes['include'] == 'Y', ['phecode', 'phecode_string']]
pred = pred.merge(phecodes, on='phecode')

# Attach the SIDER and Open Targets indication/phase columns by the 'g-p' key;
# left joins keep predictions that have no match in either table.
pred = pred.merge(st, on='g-p', how='left').merge(ot, on='g-p', how='left')

# Source column -> published column label, in output order. Selecting by this
# mapping drops every other column (including the 'g-p' key), so no separate
# drop() is needed, and each label sits next to the column it names.
pred_columns = {
    'gene': 'Gene',
    'phecode': 'Phecode',
    'phecode_string': 'Phecode description',
    'pcb': 'ML-GPS',
    'pcb_act': 'ML-GPS DOE (activator)',
    'pcb_inh': 'ML-GPS DOE (inhibitor)',
    'indication': 'Any indication',
    'phase': 'Maximum phase',
    'activator': 'Activator indication',
    'inhibitor': 'Inhibitor indication',
    'Open Targets indication': 'Open Targets indication',
    'Open Targets phase': 'Open Targets phase',
    'SIDER indication': 'SIDER indication',
    'SIDER phase': 'SIDER phase',
    'direct_score': 'Open Targets direct association score',
    'indirect_score': 'Open Targets indirect association score',
}
pred = pred[list(pred_columns)].rename(columns=pred_columns)
# The left joins above introduce NaN where there was no match; fill with 0
# before the final ordering by descending ML-GPS.
pred = pred.fillna(0).sort_values('ML-GPS', ascending=False).reset_index(drop=True)

pred.to_csv('./Final predictions/ML-GPS scores.csv', index=False)
# pred is already sorted, so the top 10% is simply its leading rows; taking a
# prefix keeps this file an exact head of the full file even when scores tie.
top_n = int(len(pred) * 0.10)
pred.iloc[:top_n].to_csv('./Website/Data/ML-GPS scores (top 10%).csv', index=False)

# Write ML-GPS features

agp = pd.read_csv('./Datasets/all_gp.csv')
# Inner merge: keeps only rows whose phecode is in `phecodes` and adds the
# 'phecode_string' description column.
agp = agp.merge(phecodes, on='phecode')

# Source column -> published column label, in output order.
# The last entry was previously a second copy of 'c_g_z', which made the
# 'C ultra-rare' column a duplicate of 'C common'; 'c_b_z' follows the
# p_b_z / b_b_z naming used for the other ultra-rare columns.
feature_columns = {
    'gene': 'Gene',
    'phecode': 'Phecode',
    'phecode_string': 'Phecode description',
    'EVA_count': 'EVA',
    'HGMD_count': 'HGMD',
    'OMIM': 'OMIM',
    'l2g_score': 'L2G',
    'p_g_z': 'P common',
    'p_e_z': 'P rare',
    'p_b_z': 'P ultra-rare',
    'b_g_z': 'B common',
    'b_e_z': 'B rare',
    'b_b_z': 'B ultra-rare',
    'c_g_z': 'C common',
    'c_e_z': 'C rare',
    'c_b_z': 'C ultra-rare',
}
agp = agp[list(feature_columns)].rename(columns=feature_columns)

# Bring in the ML-GPS score for each gene-phecode pair and order by it.
agp = agp.merge(pred[['Gene', 'Phecode', 'ML-GPS']], on=['Gene', 'Phecode'])
agp = agp.sort_values('ML-GPS', ascending=False).reset_index(drop=True)

# Output layout: the three identifier columns, then the score, then features.
feature_labels = list(feature_columns.values())
agp = agp[feature_labels[:3] + ['ML-GPS'] + feature_labels[3:]]

agp.to_csv('./Final predictions/ML-GPS features.csv', index=False)
# agp is already sorted, so the top rows are a prefix of the full file.
# NOTE(review): the cutoff is 10% of len(pred), not len(agp) - presumably so
# this file covers the same number of rows as the top-10% scores file; confirm.
top_n = int(len(pred) * 0.10)
agp.iloc[:top_n].to_csv('./Website/Data/ML-GPS features (top 10%).csv', index=False)

# Write ML-GPS DOE features

# Start again from the raw gene-phecode table and attach phecode descriptions
# (inner merge on 'phecode').
agp = pd.read_csv('./Datasets/all_gp.csv').merge(phecodes, on='phecode')

# Source column -> published label, listed in output order.
doe_columns = {
    'gene': 'Gene',
    'phecode': 'Phecode',
    'phecode_string': 'Phecode description',
    'EVA_act': 'EVA (act.)',
    'EVA_inh': 'EVA (inh.)',
    'EVA_neu': 'EVA (neu.)',
    'HGMD_act': 'HGMD (act.)',
    'HGMD_inh': 'HGMD (inh.)',
    'HGMD_neu': 'HGMD (neu.)',
    'OMIM': 'OMIM',
    'l2g_act': 'L2G (act.)',
    'l2g_inh': 'L2G (inh.)',
    'l2g_neu': 'L2G (neu.)',
    'p_g_act': 'P common (act.)',
    'p_g_inh': 'P common (inh.)',
    'p_e_act': 'P rare (act.)',
    'p_e_inh': 'P rare (inh.)',
    'p_e_neu': 'P rare (neu.)',
    'p_b_act': 'P ultra-rare (act.)',
    'p_b_inh': 'P ultra-rare (inh.)',
    'b_g_act': 'B common (act.)',
    'b_g_inh': 'B common (inh.)',
    'b_e_act': 'B rare (act.)',
    'b_e_inh': 'B rare (inh.)',
    'b_e_neu': 'B rare (neu.)',
    'b_b_act': 'B ultra-rare (act.)',
    'b_b_inh': 'B ultra-rare (inh.)',
    'c_g_act': 'C common (act.)',
    'c_g_inh': 'C common (inh.)',
    'c_e_act': 'C rare (act.)',
    'c_e_inh': 'C rare (inh.)',
    'c_e_neu': 'C rare (neu.)',
    'c_b_act': 'C ultra-rare (act.)',
    'c_b_inh': 'C ultra-rare (inh.)',
}
agp = agp[list(doe_columns)]
agp = agp.rename(columns=doe_columns)

# Add the overall and direction-of-effect scores for each gene-phecode pair,
# then order rows by descending ML-GPS.
score_labels = ['ML-GPS', 'ML-GPS DOE (activator)', 'ML-GPS DOE (inhibitor)']
agp = agp.merge(pred[['Gene', 'Phecode'] + score_labels], on=['Gene', 'Phecode'])
agp = agp.sort_values('ML-GPS', ascending=False)
agp = agp.reset_index(drop=True)

# Final layout: identifier columns, the three scores, then every feature.
doe_labels = list(doe_columns.values())
agp = agp[doe_labels[:3] + score_labels + doe_labels[3:]]

agp.to_csv('./Final predictions/ML-GPS features (DOE).csv', index=False)

# Top slice for the website; the row count is taken from len(pred).
top_count = int(len(pred) * 0.1)
ranked = agp.sort_values('ML-GPS', ascending=False)
ranked[:top_count].to_csv('./Website/Data/ML-GPS features (top 10%) (DOE).csv', index=False)