In [2]:
import os
import re
import warnings

# Installed before the third-party imports so that warnings raised while
# importing them are suppressed as well (this silences ALL warnings).
warnings.filterwarnings('ignore')

import matplotlib
import matplotlib.font_manager as font_manager
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

# Phecode reference table; `cd` keeps only the rows whose Type is 'Common'.
rd = pd.read_excel('./Resources/combined_phecodex_list.xlsx')
cd = rd.loc[rd['Type'] == 'Common']

# Optional plotting configuration.
# The Arial file lives at a machine-specific absolute path. When it is present
# it is registered and used as the sans-serif face; when it is absent the
# notebook keeps matplotlib's default sans-serif font instead of aborting,
# because nothing in the data processing depends on the font.
font_path = '/sc/arion/projects/GENECAD/Robert/arial.ttf'

matplotlib.rc('font', family='sans-serif')
matplotlib.rcParams.update({
    'font.size': 12,
    'axes.titlesize': 12
})

if os.path.exists(font_path):
    font_manager.fontManager.addfont(font_path)
    prop = font_manager.FontProperties(fname=font_path)
    matplotlib.rcParams['font.sans-serif'] = prop.get_name()
else:
    # print() rather than warnings.warn(): warnings are globally ignored above.
    print(f'Font file not found at {font_path}; using the default sans-serif font.')


In [3]:
# Named groups of evidence-source identifiers. Several of these names are used
# as column names of `alldata` further down in the notebook.
clin = [
    'clingen', 'eva', 'gene2phenotype', 'genomics_england',
    'orphanet', 'uniprot_literature', 'uniprot_variants',
]
ot = [
    'clin_ot', 'hgmd', 'gwas_credible_sets',
    'expression_atlas', 'impc', 'europepmc',
]
mantis = ['mantis']
cc = [
    'cc_common_max_p', 'cc_rare_max_p',
    'cc_rare_burden_max_p', 'cc_ultrarare_max_p',
]

# Full model-input table.
alldata = pd.read_pickle('./Final/ml_input.pkl')


In [4]:
# Additional labels for dataset (optional)

# --- Mechanism of action as recorded in the model-input table -----------------
# Map the free-text `moa` value onto three classes; a value mentioning both
# 'Positive' and 'Negative' is treated as unclear (last rule wins).
truemoa = alldata[['gene','phecode','moa']].dropna()
truemoa['moa'] = truemoa['moa'].astype(str)
truemoa.loc[truemoa['moa'].str.contains('Negative'), 'moa_clean'] = 'Inhibition'
truemoa.loc[truemoa['moa'].str.contains('Positive'), 'moa_clean'] = 'Activation'
truemoa.loc[truemoa['moa'].str.contains('Other'), 'moa_clean'] = 'Other or unclear'
truemoa.loc[(truemoa['moa'].str.contains('Positive') & truemoa['moa'].str.contains('Negative')), 'moa_clean'] = 'Other or unclear'
truemoa = truemoa[['gene','phecode','moa_clean']]

# --- Drug / target / phase table with a display name per ChEMBL id ------------
dtp = pd.read_pickle('./Final/drug_target_phase.pkl')
names = pd.read_pickle('./Resources/targets_with_names_cleaned.pkl')[['chembl_id','dc_name']].dropna().drop_duplicates('chembl_id').rename({'dc_name':'name'},axis=1)
dtp = dtp.merge(names, how='left')

# --- One mechanism label per (phecode, gene) ----------------------------------
# Count the rows of each mechanism type per pair, one column per type.
moa = dtp.groupby(['phecode','gene'])['moa'].value_counts().reset_index()
moa = pd.pivot_table(moa, index=['phecode','gene'], columns='moa', values='count').reset_index()
moa = moa.fillna(0)

n_pos = moa['Positive modulator']
n_neg = moa['Negative modulator']
n_other = moa['Other/unknown']

# Primary rules: a direction wins when it strictly beats the opposite direction
# and is at least as frequent as 'Other/unknown'. Equal directional counts are
# 'Conflicting', unless both are zero.
moa.loc[(n_pos > n_neg) & (n_pos >= n_other), 'moa'] = 'Activation'
moa.loc[(n_neg > n_pos) & (n_neg >= n_other), 'moa'] = 'Inhibition'
moa.loc[n_pos == n_neg, 'moa'] = 'Conflicting'
moa.loc[(n_pos == 0) & (n_neg == 0), 'moa'] = 'Other or unknown'

# Fallback for pairs where 'Other/unknown' outnumbers both directions: the
# larger directional count wins, provided it has more than one supporting row.
# The comparison between the two directions is required here; without it a pair
# with more activators than inhibitors would be labelled 'Inhibition' whenever
# it had at least two inhibitors, simply because that rule is applied first.
moa.loc[moa['moa'].isna() & (n_neg > n_pos) & (n_neg > 1), 'moa'] = 'Inhibition'
moa.loc[moa['moa'].isna() & (n_pos > n_neg) & (n_pos > 1), 'moa'] = 'Activation'
moa['moa'] = moa['moa'].fillna('Other or unknown')
moa = moa[['phecode','gene','moa']].drop_duplicates()

# --- Indicated drugs per (gene, phecode), as "name (max phase)" ---------------
ind = dtp[['gene','phecode','phase','name']].drop_duplicates()
# groupby drops rows whose `name` is missing, so every remaining row has a name.
ind = ind.groupby(['gene','phecode','name'])['phase'].max().reset_index()
ind['phase'] = ind['phase'].map({0.5:'preclinical',1:'I',2:'II',3:'III',4:'IV'})
ind['label'] = ind['name'] + ' (' + ind['phase'].astype(str) + ')'
# Join the distinct labels directly. Going through str() of the NumPy array and
# stripping quotes/brackets breaks when the array repr wraps onto a new line
# (leaving "\n" inside the label) and mangles names that themselves contain
# apostrophes or square brackets.
ind = (
    ind.groupby(['gene','phecode'])['label']
    .agg(lambda labels: ', '.join(labels.unique()))
    .reset_index()
)

# --- Drug names per gene, one column per mechanism type -----------------------
tn = pd.read_pickle('./Resources/targets_with_names_cleaned.pkl').sort_values(['gene','dc_name'])
tn = tn.groupby(['gene','moa'])['dc_name'].unique().reset_index()
# NOTE(review): this still formats the name list via the array's string repr;
# separators produced by line wrapping ("'\n '") are not removed in this cell.
tn['dc_name'] = tn['dc_name'].astype(str)
tn['dc_name'] = tn['dc_name'].str.replace("' '",", ",regex=False).str.replace("['","",regex=False).str.replace("']","",regex=False)
tn = pd.pivot(tn, index='gene', columns='moa', values='dc_name').reset_index()


In [5]:
# Load the predictions (highest score first) and attach, in order: the phecode
# description, druggability columns from `alldata`, the per-pair mechanism
# label, per-gene drug-name lists, and the indicated-drug label.
pred = (
    pd.read_pickle('./GPS/Main/Predictions/apc_predictions_ot_mantis_cc.pkl')
    .sort_values('prediction', ascending=False)
    .merge(rd[['phecode','phecode_string']])
    .merge(alldata[['id','drug_gene','Tclin']])
    .merge(moa, how='left')
    .merge(tn, on='gene', how='left')
    .merge(ind, how='left')
)

# Indicated pairs for which no drug label was found get a placeholder.
missing_label = (pred['indication'] == 1) & (pred['label'].isna())
pred.loc[missing_label, 'label'] = 'unknown'

# Turn leftover array-repr separators (same-line and line-wrapped) into ", ".
for drug_col in ['Positive modulator', 'Negative modulator', 'Other']:
    cleaned = pred[drug_col].str.replace("' '",", ",regex=False)
    pred[drug_col] = cleaned.str.replace("'\n '",", ",regex=False)


In [6]:
# Score cutoff used to select the rows written to the "top" export.
RAREGPS_TOP_THRESHOLD = 0.036805

# Work on an explicit copy so the column assignment below never targets a
# view/slice of `pred`.
main = pred[['gene','phecode','phecode_string','prediction','indication','phase',
             'moa','label',
             'drug_gene','Tclin',
             'Positive modulator','Negative modulator','Other']].copy()
main['phecode'] = main['phecode'] + " [" + main['phecode_string'] + ']'
main = main.drop('phecode_string',axis=1)

# Rename by column name (not by position) so a reordered selection above cannot
# silently mislabel columns in the published file.
scores = main.rename(columns={
    'gene': 'Gene',
    'phecode': 'Phecode',
    'prediction': 'RareGPS',
    'indication': 'Indication',
    'phase': 'Max phase',
    'moa': 'Indicated mechanism',
    'label': 'Indicated drugs',
    'drug_gene': 'Druggable gene',
    'Tclin': 'DrugnomeAI',
    'Positive modulator': 'Activators',
    'Negative modulator': 'Inhibitors',
    'Other': 'Other drugs',
})

# Select the top rows on the unrounded score, then round both tables.
scores_top = scores.loc[scores['RareGPS'] >= RAREGPS_TOP_THRESHOLD].copy()

rounded_cols = ['RareGPS','DrugnomeAI']
scores[rounded_cols] = scores[rounded_cols].round(3)
scores_top[rounded_cols] = scores_top[rounded_cols].round(3)

scores.to_csv('./Public/RareGPS_all.csv', index=False)
scores_top.to_csv('./Public/RareGPS_top.csv', index=False)


In [7]:
# Score cutoff used to select the rows written to the "top" export.
RAREGPS_TOP_THRESHOLD = 0.036805

# Per-source evidence values for every gene-phecode pair.
evidence = alldata[['gene','phecode','expression_atlas','hgmd','gwas_credible_sets','impc',
                    'clin_ot','europepmc','cc_common_max_p','cc_rare_max_p','cc_rare_burden_max_p',
                    'cc_ultrarare_max_p','mantis']]

evidence = pred[['gene','phecode','phecode_string','prediction','indication','phase']].merge(evidence, on=['gene','phecode'])
evidence['phecode'] = evidence['phecode'] + " [" + evidence['phecode_string'] + ']'
evidence = evidence.drop('phecode_string',axis=1)

# Count how many evidence categories support each pair (0-6). Missing values
# compare as False and therefore do not count.
evidence['categories'] = 0
evidence.loc[evidence[['clin_ot','hgmd']].max(axis=1) > 0, 'categories'] += 1
# NOTE(review): 1.301 is approximately -log10(0.05); this presumes the cc_*
# columns hold -log10 p-values - confirm against the pipeline producing them.
evidence.loc[(evidence[['cc_common_max_p','cc_rare_max_p',
                       'cc_rare_burden_max_p','cc_ultrarare_max_p']].max(axis=1) > 1.301) | (evidence['gwas_credible_sets'] > 0), 'categories'] += 1
evidence.loc[evidence[['europepmc']].max(axis=1) > 0, 'categories'] += 1
evidence.loc[evidence[['mantis']].max(axis=1) > 0, 'categories'] += 1
evidence.loc[evidence[['impc']].max(axis=1) > 0, 'categories'] += 1
evidence.loc[evidence[['expression_atlas']].max(axis=1) > 0, 'categories'] += 1

# Rename by column name (not by position) to the published headers.
evidence = evidence.rename(columns={
    'gene': 'Gene',
    'phecode': 'Phecode',
    'prediction': 'RareGPS',
    'indication': 'Indication',
    'phase': 'Max phase',
    'expression_atlas': 'Gene expression',
    'hgmd': 'HGMD',
    'gwas_credible_sets': 'L2G',
    'impc': 'Mouse models',
    'clin_ot': 'OTP clinical genetics',
    'europepmc': 'Text mining',
    'cc_common_max_p': 'Common variants',
    'cc_rare_max_p': 'Rare variants',
    'cc_rare_burden_max_p': 'Rare variants (gene-level)',
    'cc_ultrarare_max_p': 'Ultrarare variants (gene-level)',
    'mantis': 'Mantis-ML',
    'categories': 'Sources',
})

# Decide top membership on the UNROUNDED score, matching how the top scores
# table is selected; filtering after rounding would admit scores that only
# reach the cutoff because they were rounded up.
is_top = evidence['RareGPS'] >= RAREGPS_TOP_THRESHOLD

rounded_cols = ['RareGPS','Gene expression','HGMD','L2G','Mouse models','OTP clinical genetics','Text mining',
                'Common variants','Rare variants','Rare variants (gene-level)','Ultrarare variants (gene-level)','Mantis-ML']
evidence[rounded_cols] = evidence[rounded_cols].round(3)

evidence.to_csv('./Public/RareGPS_evidence.csv', index=False)

# Write the filtered table (not the full one) to the top-evidence file.
evidence_top = evidence.loc[is_top].copy()
evidence_top.to_csv('./Public/RareGPS_top_evidence.csv', index=False)


In [8]:
# Number of gene-phecode pairs per count of supporting evidence categories.
source_counts = evidence['Sources'].value_counts()
source_counts

Sources
1    2004186
2     517550
0     453034
3      41803
4       4694
5        658
6         40
Name: count, dtype: int64