In [1]:
import numpy as np
import pandas as pd
import sys
import os

from sklearn.metrics import roc_auc_score

import scipy.stats

In [2]:
# Root directory that holds the model scores and the per-split AUC tables read
# by this notebook. The cluster path stays the default; set the MLM_DATA_DIR
# environment variable to run the notebook against another copy of the data.
# os.path.join(..., '') guarantees the trailing separator that the
# `data_dir + 'perbase_pred/...'` concatenations in later cells rely on.
data_dir = os.path.join(
    os.environ.get('MLM_DATA_DIR', '/lustre/groups/epigenereg01/workspace/projects/vale/MLM/'),
    '',
)

# Compute AUC

For the selected split, compute the ROC AUC of every model score and estimate its confidence interval via bootstrapping.

In [16]:
# Load the tab-separated table of model scores for all variants.
utr_variants = pd.read_csv(
    f'{data_dir}perbase_pred/model_scores_snp.tsv',
    sep='\t',
)

# Number of variants per label within each split (displayed as the cell output).
utr_variants.groupby('split')['label'].value_counts()

split    label
clinvar  1.0        215
         0.0        215
eQTL     1.0       2196
         0.0       2196
gnomAD   0.0      10000
         1.0      10000
Name: count, dtype: int64

In [17]:
# Split evaluated in this run; the value is compared against the `split`
# column of `utr_variants` in the next cell.
split = 'clinvar' # clinvar, gnomAD or eQTL

# Script-mode alternative (disabled in the notebook): read the split and the
# output file name from the command line instead.
#split = sys.argv[1]
#output_name = sys.argv[2]

In [18]:
# Keep only the rows of the selected split. The name `utr_variants` is rebound
# to the subset, so the table has to be reloaded before choosing another split.
utr_variants = utr_variants.loc[utr_variants['split'] == split]

In [19]:
# Scores to evaluate. Each name is the prefix of the `<name>_ref` column and,
# except for the PhyloP entries, of the `<name>_alt` column used below.
models = (
    'Species-aware',
    'Species-agnostic',
    'DNABERT',
    'NT-MS-v2-500M',
    '13-mer',
    'PhyloP100',
    'PhyloP240',
)

In [30]:
from scipy.stats import bootstrap

def bootstrap_auc(score, labels=None, n_resamples=100, seed=0):
    """Compute the ROC AUC of one score and a bootstrap estimate of its error.

    Rows where ``score`` is NaN are dropped from both the scores and the
    labels before anything is computed.

    Parameters
    ----------
    score : pandas.Series
        Score to evaluate, indexed like ``labels``.
    labels : pandas.Series, optional
        Binary labels. Defaults to ``utr_variants.label`` (looked up at call
        time), which is what the original single-argument call used.
    n_resamples : int, optional
        Number of bootstrap resamples (default 100, as before).
    seed : int or None, optional
        Seed of the random generator used for resampling, so that the error
        bars are reproducible between runs. Pass ``None`` for a fresh,
        non-deterministic generator.

    Returns
    -------
    (auc, auc_err) : tuple of float
        ``auc`` is the ROC AUC on all non-missing rows; ``auc_err`` is half the
        width of the bootstrap confidence interval (scipy's default confidence
        level and method).
    """
    if labels is None:
        labels = utr_variants.label

    keep = ~score.isna()
    y_true = labels[keep].values
    y_pred = score[keep].values

    # paired=True resamples labels and scores with the same indices, so every
    # resample still consists of matching (label, score) pairs.
    bs = bootstrap((y_true, y_pred), statistic=lambda x, y: roc_auc_score(x, y),
                   vectorized=False, paired=True, n_resamples=n_resamples,
                   random_state=np.random.default_rng(seed))

    auc = roc_auc_score(y_true, y_pred)
    ci = bs.confidence_interval
    auc_err = (ci.high - ci.low) / 2

    # Return plain Python floats: the result tuples are later written to and
    # re-parsed from text as '(auc, err)', and NumPy >= 2 scalars would be
    # rendered as 'np.float64(...)', which that parser cannot read.
    return float(auc), float(auc_err)

# One record per model: (name, ref AUC, inverse-alt AUC, ratio AUC). Every AUC
# entry is the pair returned by bootstrap_auc, or None when it is not computed.
roc_df = []

for model_name in models:

    print(model_name)

    print('Pref AUC')
    ref_col = utr_variants[f'{model_name}_ref']
    ref_auc = bootstrap_auc(ref_col)

    if 'PhyloP' in model_name:
        # PhyloP entries are only evaluated on their `_ref` column.
        inv_alt_auc = None
        ratio_auc = None
    else:
        print('P alt inv AUC')
        alt_col = utr_variants[f'{model_name}_alt']
        # log(1 / alt): the score grows as the alt value shrinks.
        inv_alt_auc = bootstrap_auc(np.log(1/alt_col))
        print('P ratio AUC')
        # log(ref / alt)
        ratio_auc = bootstrap_auc(np.log(ref_col/alt_col))

    roc_df.append((model_name, ref_auc, inv_alt_auc, ratio_auc))

Species-aware
Pref AUC
P alt inv AUC
P ratio AUC
Species-agnostic
Pref AUC
P alt inv AUC
P ratio AUC
DNABERT
Pref AUC
P alt inv AUC
P ratio AUC
NT-MS-v2-500M
Pref AUC
P alt inv AUC
P ratio AUC
13-mer
Pref AUC
P alt inv AUC
P ratio AUC
PhyloP100
Pref AUC
PhyloP240
Pref AUC


In [31]:
# Turn the list of per-model records into a table with one row per model and
# one column per score type.
roc_df = pd.DataFrame(
    data=roc_df,
    columns=['model', 'ref_auc', 'inv_alt_auc', 'ratio_auc'],
)

In [22]:
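# Export step, disabled in the notebook (used together with the sys.argv lines above).
# NOTE(review): the table-building cell below reads the per-split files with sep='\t';
# confirm that the export is written with the same separator.
#roc_df.to_csv(output_name, index=False)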

# Make AUC table

In [71]:
# Row order of the final table. This is a list (not a tuple) because it is
# passed to `.loc[...]` to select and order the rows of each split.
models = [
    'DNABERT',
    'NT-MS-v2-500M',
    '13-mer',
    'Species-agnostic',
    'Species-aware',
    'PhyloP100',
    'PhyloP240',
]

In [72]:
# Read the stored AUC table of every split, tag it with the split name and put
# its rows in the order given by `models`.
per_split = []

for split in ('clinvar','gnomAD','eQTL'):
    df = pd.read_csv(f'{data_dir}perbase_pred/auc/snps/{split}.tsv', sep='\t')
    df = df.assign(split=split).set_index('model').loc[models].reset_index()
    per_split.append(df)

roc_df = pd.concat(per_split)

# Every cell that is not a string (e.g. the NaN read for a score that was not
# computed) is replaced by the '(0,0)' placeholder so that all cells can be
# parsed the same way afterwards.
roc_df = roc_df.map(lambda cell: cell if type(cell) is str else '(0,0)')

In [73]:
def _parse_auc_pair(text):
    """Parse a '(first,second)' string into a tuple of two floats."""
    fields = text.split(',')
    first = fields[0].lstrip('(')
    second = fields[1].rstrip(')')
    return (float(first), float(second))

# Index the table by (split, model); all remaining columns hold '(auc,err)'
# strings, which are converted to float pairs.
roc_df = roc_df.set_index(['split','model'])
roc_df = roc_df.map(_parse_auc_pair)

In [74]:
# Number of entries in `models`.
N_models = len(models)

def is_overlap(a, b):
    """Return whether intervals ``a`` and ``b`` share a stretch of positive length.

    Both arguments are ``(lower, upper)`` pairs. Intervals that only touch at
    an end point do not count as overlapping.
    """
    left = max(a[0], b[0])
    right = min(a[1], b[1])
    shared_length = max(0, right - left)
    return shared_length > 0

# For every split, list the (model, score) cells whose AUC interval lies
# entirely above 0.5 and overlaps the interval of the best AUC of that split.
best_models = {}

for split in ('clinvar','gnomAD','eQTL'):

    split_data = roc_df.loc[split]

    # Largest (auc, err) pair over all models and score types of this split;
    # tuples compare by AUC first.
    best_auc, best_auc_err = max(split_data.values.flatten())
    best_interval = (best_auc-best_auc_err, best_auc+best_auc_err)

    selected = []
    best_models[split] = selected

    for model in split_data.index:
        for score in split_data.columns:
            auc, auc_err = split_data.loc[model,score]
            lower = auc-auc_err
            upper = auc+auc_err
            if lower>0.5 and is_overlap(best_interval,(lower,upper)):
                selected.append((model,score))

def highlight_ns(x,best_models):
    """Build the CSS styles for one table row.

    ``x`` is a row whose ``name`` is the ``(split, model)`` pair and whose
    index holds the score names. A cell gets ``'font-weight: bold'`` when its
    ``(model, score)`` pair is listed in ``best_models[split]`` (the best
    model and the models not significantly different from it); every other
    cell gets an empty style string.
    """
    split, model = x.name
    styles = []
    for score in x.index:
        if (model,score) in best_models[split]:
            styles.append('font-weight: bold')
        else:
            styles.append('')
    return styles

In [75]:
def _format_auc(pair):
    """Render an (auc, err) pair as 'auc±err' with three decimals; the (0,0) placeholder becomes '-'."""
    if pair == (0,0):
        return '-'
    return f'{pair[0]:.3f}±{pair[1]:.3f}'

# Display copy of the table: the numeric pairs in `roc_df` stay untouched.
df = roc_df.copy()

for col in df.columns:
    df[col] = df[col].apply(_format_auc)

#df = df.rename(columns={'ref_auc':'$P_{ref}$','inv_alt_auc':'$P^{-1}_{alt}$','ratio_auc':'$P_{ref}/P_{alt}$'})

In [76]:
# Style the table row by row (axis=1); `best_models` is forwarded to
# highlight_ns as a keyword argument.
df.style.apply(highlight_ns, best_models=best_models, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,ref_auc,inv_alt_auc,ratio_auc
split,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
clinvar,DNABERT,0.483±0.036,0.552±0.033,0.514±0.038
clinvar,NT-MS-v2-500M,0.489±0.056,0.537±0.055,0.522±0.054
clinvar,13-mer,0.501±0.040,0.549±0.036,0.524±0.039
clinvar,Species-agnostic,0.490±0.039,0.565±0.037,0.520±0.038
clinvar,Species-aware,0.505±0.037,0.589±0.035,0.544±0.038
clinvar,PhyloP100,0.911±0.024,-,-
clinvar,PhyloP240,0.777±0.045,-,-
gnomAD,DNABERT,0.582±0.008,0.578±0.008,0.594±0.008
gnomAD,NT-MS-v2-500M,0.517±0.016,0.548±0.016,0.541±0.016
gnomAD,13-mer,0.579±0.008,0.567±0.008,0.588±0.008
