## Imports

In [37]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold

# Column-name groups. The prediction cell concatenates them to build the
# feature list of each model: cov + ot + mantis for 'RareGPS_No_GA', and the
# same list plus cc for 'RareGPS'.
cov = ["loeuf"]
ot = [
    "clin_ot",
    "hgmd",
    "ot_genetics_portal",
    "expression_atlas",
    "impc",
    "europepmc",
]
mantis = ["mantis"]
# Only used by the full 'RareGPS' model.
cc = [
    "cc_common_max_p",
    "cc_rare_max_p",
    "cc_rare_burden_max_p",
    "cc_ultrarare_max_p",
]

## Generate predictions using trained models

Here we use trained models to generate predictions for 120 ultrarare phecodes not included in our training set. 

Because genetic associations are not available for these phecodes, we use the version of RareGPS without genetic associations ("RareGPS_No_GA").

In [33]:
# Load the example gene-phecode (G-P) input table.
# NOTE: pickle files can execute code on load; only read files you trust.
ur = pd.read_pickle('./Examples/ultrarare_input.pkl')

# Label -> summary statistic, printed in order as a quick sanity check.
input_summaries = {
    'G-P pairs': lambda frame: frame['id'].nunique(),
    'Genes': lambda frame: frame['gene'].nunique(),
    'Phecodes': lambda frame: frame['phecode'].nunique(),
    'Indications': lambda frame: frame['indication'].sum(),
}
for summary_label, summarise in input_summaries.items():
    print(summary_label, summarise(ur))

# Last expression of the cell: preview of the first rows.
ur.head()

G-P pairs 167817
Genes 17823
Phecodes 120
Indications 851


Unnamed: 0,id,gene,phecode,indication,phase,loeuf,clin_ot,hgmd,ot_genetics_portal,expression_atlas,impc,europepmc,mantis
155509,GE_976.1:CWC25,CWC25,GE_976.1,0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.2035
109132,BI_164.8:RBM39,RBM39,BI_164.8,0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0674
59619,GE_976.1:CLIC4,CLIC4,GE_976.1,0,0.0,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.2652
163932,DE_685.5:ZNF322,ZNF322,DE_685.5,0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.1322
149137,GE_976.1:TRAPPC2L,TRAPPC2L,GE_976.1,0,0.0,1.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0587


The feature columns of your input table must be named exactly as shown above. 

The columns 'indication' and 'phase' are not necessary to generate predictions but are helpful to validate the accuracy of predictions. Please refer to the manuscript for details about how each feature was scored.

RareGPS models were trained using nested 5-fold cross-validation (5 outer folds × 5 inner folds), which yields 25 trained models. Each model scores every G-P pair, so 25 predictions are generated per pair and averaged to output a single prediction.

In [None]:
# Select which trained model family to run:
#   0 -> 'RareGPS_No_GA' (features: cov + ot + mantis)
#   1 -> 'RareGPS'       (features: cov + ot + mantis + cc)
mnum = 0 # Select model
model_names = [['RareGPS_No_GA','RareGPS'][mnum]]
feature_names = [[cov+ot+mantis,cov+ot+mantis+cc][mnum]]

# pandas does not create missing parent directories when writing, so make sure
# the output folder exists before any model is scored.
output_dir = Path('./Outputs/Predictions')
output_dir.mkdir(parents=True, exist_ok=True)

for model_name, features in zip(model_names, feature_names):
    print('Model', model_name)

    # Fail early, naming the offending columns, when the input table lacks
    # features the selected model needs.
    missing_features = [col for col in features if col not in ur.columns]
    if missing_features:
        raise KeyError(
            f"Input table is missing feature columns required by {model_name}: {missing_features}"
        )

    X_ur = ur[features]
    y_ur = ur['indication']
    ids_ur = ur['id']

    # The feature matrix is the same for every trained model, so build the
    # DMatrix once instead of once per fold.
    dmatrix_ur = xgb.DMatrix(X_ur, enable_categorical=True)

    # One (id, prediction) frame per trained model.
    fold_prediction_frames = []

    # One saved model per (fold, inner_fold) combination: 5 x 5 = 25 models.
    for fold in range(1,6):
        for inner_fold in range(1,6):

            model = xgb.Booster()
            model.load_model(f"./Trained models/{model_name}_{fold}_{inner_fold}.json")

            y_pred_ur = model.predict(dmatrix_ur)

            # Per-model metrics, printed only as a progress / sanity check.
            auroc_ur = roc_auc_score(y_ur, y_pred_ur)
            auprc_ur = average_precision_score(y_ur, y_pred_ur)
            print('Fold',fold,inner_fold,'AUROC',round(auroc_ur,2),'AUPRC',round(auprc_ur,2))

            fold_prediction_frames.append(pd.DataFrame({
                'id': ids_ur,
                'prediction': y_pred_ur
            }))

    # Average the per-model scores for each G-P pair, then re-attach the
    # descriptive columns from the input table.
    ur_predictions = pd.concat(fold_prediction_frames, ignore_index=True)
    ur_predictions = ur_predictions.groupby('id')['prediction'].mean().reset_index()
    ur_predictions = ur_predictions.merge(
        ur[['id','gene','phecode','indication','phase']], on='id'
    )
    ur_predictions.to_pickle(output_dir / f'ur_predictions_{model_name}.pkl')


Model RareGPS_No_GA
Fold 1 1 AUROC 0.82 AUPRC 0.05
Fold 1 2 AUROC 0.83 AUPRC 0.05
Fold 1 3 AUROC 0.82 AUPRC 0.05
Fold 1 4 AUROC 0.83 AUPRC 0.05
Fold 1 5 AUROC 0.83 AUPRC 0.05
Fold 2 1 AUROC 0.83 AUPRC 0.05
Fold 2 2 AUROC 0.82 AUPRC 0.04
Fold 2 3 AUROC 0.83 AUPRC 0.05
Fold 2 4 AUROC 0.83 AUPRC 0.05
Fold 2 5 AUROC 0.83 AUPRC 0.05
Fold 3 1 AUROC 0.83 AUPRC 0.05
Fold 3 2 AUROC 0.82 AUPRC 0.05
Fold 3 3 AUROC 0.82 AUPRC 0.05
Fold 3 4 AUROC 0.82 AUPRC 0.05
Fold 3 5 AUROC 0.83 AUPRC 0.05
Fold 4 1 AUROC 0.83 AUPRC 0.05
Fold 4 2 AUROC 0.83 AUPRC 0.05
Fold 4 3 AUROC 0.83 AUPRC 0.05
Fold 4 4 AUROC 0.82 AUPRC 0.05
Fold 4 5 AUROC 0.83 AUPRC 0.05
Fold 5 1 AUROC 0.83 AUPRC 0.05
Fold 5 2 AUROC 0.83 AUPRC 0.05
Fold 5 3 AUROC 0.83 AUPRC 0.05
Fold 5 4 AUROC 0.82 AUPRC 0.05
Fold 5 5 AUROC 0.83 AUPRC 0.05


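Here we see that every model achieves a high AUROC (about 0.82–0.83). The AUPRC (about 0.05) is low in absolute terms, but it is roughly ten times the proportion of cases (about 0.005, reported below), which is the AUPRC expected from a random ranking.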

In [None]:
# Reload the averaged predictions written by the prediction cell.
ur_predictions = pd.read_pickle('./Outputs/Predictions/ur_predictions_RareGPS_No_GA.pkl')

indication_labels = ur_predictions['indication']
averaged_scores = ur_predictions['prediction']

# Ranking metrics of the averaged score against the indication labels.
print('AUROC', round(roc_auc_score(indication_labels, averaged_scores), 4))
print('AUPRC', round(average_precision_score(indication_labels, averaged_scores), 4))
# Mean of the 'indication' column, printed as the proportion of cases.
print('Proportion of cases', round(indication_labels.mean(), 4))

AUROC 0.836
AUPRC 0.0526
Proportion of cases 0.0051


Below are the G-P pairs with highest scores.

In [None]:
# Reload the averaged predictions and the phecode annotation sheet.
ur_predictions = pd.read_pickle('./Outputs/Predictions/ur_predictions_RareGPS_No_GA.pkl')
ur_annotations = pd.read_excel('./Examples/ultrarare_phecode_list.xlsx')

# Keep only the phecode and its description from the annotation sheet.
phecode_labels = ur_annotations[['phecode','phecode_string']]

# merge() joins on the columns the two frames share.
annotated_predictions = ur_predictions.merge(phecode_labels)
ranked_predictions = annotated_predictions.sort_values('prediction', ascending=False)

# Last expression of the cell: the ten highest-scoring G-P pairs.
ranked_predictions.head(10)

Unnamed: 0,id,prediction,gene,phecode,indication,phase,phecode_string
138181,GE_978.1:GHR,0.346009,GHR,GE_978.1,1,0.5,Genetic causes of short stature*
137420,GE_976.4:COL4A4,0.285241,COL4A4,GE_976.4,0,0.0,Alport syndrome*
137419,GE_976.4:COL4A3,0.264794,COL4A3,GE_976.4,0,0.0,Alport syndrome*
18968,BI_169.2:RUNX1,0.252696,RUNX1,BI_169.2,0,0.0,Qualitative platelet defects
137195,GE_976.2:CLCNKB,0.239541,CLCNKB,GE_976.2,0,0.0,Bartter's syndrome
99930,GE_971.12:F9,0.2323,F9,GE_971.12,1,4.0,Hereditary factor IX disorder [Hemophilia B]
98266,GE_970.5:HBB,0.231963,HBB,GE_970.5,1,4.0,Hereditary persistence of fetal hemoglobin [HP...
115200,GE_972.7:TOR1A,0.229842,TOR1A,GE_972.7,0,0.0,Genetic torsion dystonia
79935,GE_965.4:SCARB1,0.222636,SCARB1,GE_965.4,0,0.0,Familial hypercholesterolemia*
159010,GI_522.6:TNF,0.222077,TNF,GI_522.6,1,2.0,Eosinophilic gastroenteritis and colitis
