In [18]:
import numpy as np
import pandas as pd
import os
import sys

sys.path.append("/home/icb/sergey.vilov/workspace/MLM/mpra/utils") 

from misc import pearson_r, get_best_models, highlight_ns 

In [19]:
# Name of the regressor to evaluate.
# Supported choices: Ridge, SVR or MLP
regressor = "SVR"

In [20]:
# Directory with the prediction tables of the selected regressor
# (the regressor name is the last path component; trailing slash is kept
# because file names are appended to this string directly).
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/mpra/griesemer_2021/predictions/{}/'.format(regressor)

In [21]:
# Display name -> tag used in the prediction file names
# (files are looked up as '<cell_type>-<tag>.tsv').
# Insertion order is the row order of the final results table.
models = dict([
    ('DNABERT', 'dnabert'),
    ('DNABERT-3UTR', 'dnabert-3utr'),
    ('DNABERT-2', 'dnabert2'),
    ('DNABERT2-3UTR', 'dnabert2-3utr'),
    ('NTv2-250M', 'ntrans-v2-250m'),
    ('NTv2-250M-3UTR', 'ntrans-v2-250m-3utr'),
    ('StateSpace', 'stspace'),
    ('StateSpace-SA', 'stspace-spaw'),
    ('Griesemer et al., 2021', 'griesemer'),
])

# Cell types to evaluate; each one becomes a column of the results table.
cell_types = [
    'HEK293FT',
    'HMEC',
    'HEPG2',
    'GM12878',
    'K562',
    'SKNSH',
]

In [22]:
# Load the predictions of every model for every cell type.
#   res[cell_type] -> one long DataFrame with columns y_true, y_pred, model
# Every entry of `res` is a DataFrame (possibly empty), so downstream cells
# can always call DataFrame methods on it.
res = {}

for cell_type in cell_types:
    frames = []
    for model, model_tag in models.items():
        res_tsv = os.path.join(data_dir, f'{cell_type}-{model_tag}.tsv')
        # Models without a prediction file for this cell type are skipped.
        if not os.path.isfile(res_tsv):
            continue
        # The first row of the file is skipped and only the columns at
        # positions 84 and 85 are read, as y_true and y_pred respectively.
        df = pd.read_csv(res_tsv, sep='\t', skiprows=1, usecols=[84, 85],
                         names=['y_true', 'y_pred'])
        df['model'] = model
        frames.append(df)

    if frames:
        res[cell_type] = pd.concat(frames)
        # Sanity check: average number of predictions per model.
        print(cell_type, int(res[cell_type].groupby('model').size().mean()))
    else:
        # Nothing was found: store an empty frame instead of leaving a list
        # behind, and make the gap visible in the cell output.
        res[cell_type] = pd.DataFrame(columns=['y_true', 'y_pred', 'model'])
        print(f'{cell_type}: no prediction files found in {data_dir}')

HEK293FT 15968
HMEC 15970
HEPG2 15970
GM12878 15970
K562 15970
SKNSH 15970


In [23]:
# Score every model on every cell type and render the comparison table.
#   preds_res[cell_type]   -> per-model result of pearson_r(..., compute_CI=True)
#   best_models[cell_type] -> result of get_best_models() for that cell type
preds_res = {}
best_models = {}

for cell_type in cell_types:
    cell_preds = res[cell_type]
    # Skip cell types for which no predictions were loaded
    # (an empty container has nothing to group or score).
    if len(cell_preds) == 0:
        continue

    # Select the value columns explicitly so that apply() does not operate on
    # the grouping column (deprecated behaviour in pandas >= 2.2).
    preds_res[cell_type] = cell_preds.groupby('model')[['y_true', 'y_pred']].apply(
        lambda x: pearson_r(x.y_true, x.y_pred, compute_CI=True))

    best_models[cell_type] = get_best_models(preds_res[cell_type])

# Format each score as 'value±interval'; anything that is not a tuple
# (e.g. a model missing for a given cell type) is shown as 'none'.
# NOTE(review): assumes pearson_r returns a (value, interval) tuple - confirm in misc.py.
preds_res = pd.DataFrame(preds_res).map(
    lambda x: f'{x[0]:.2f}±{x[1]:.2f}' if isinstance(x, tuple) else 'none')

# Rows follow the order of `models`; reindex (rather than .loc) keeps the
# table renderable when a configured model has no predictions at all.
preds_res.reindex(list(models), fill_value='none').style.apply(lambda x: highlight_ns(x, best_models))

Unnamed: 0_level_0,HEK293FT,HMEC,HEPG2,GM12878,K562,SKNSH
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DNABERT,0.24±0.01,0.44±0.01,0.27±0.01,0.36±0.01,0.29±0.01,0.28±0.01
DNABERT-3UTR,0.36±0.01,0.56±0.01,0.48±0.01,0.47±0.01,0.33±0.01,0.37±0.01
DNABERT-2,0.16±0.02,0.37±0.01,0.27±0.01,0.28±0.01,0.23±0.01,0.21±0.01
DNABERT2-3UTR,0.17±0.02,0.43±0.01,0.32±0.01,0.32±0.01,0.26±0.01,0.23±0.01
NTv2-250M,0.19±0.01,0.37±0.01,0.26±0.01,0.26±0.01,0.24±0.01,0.21±0.01
NTv2-250M-3UTR,0.30±0.01,0.51±0.01,0.42±0.01,0.43±0.01,0.35±0.01,0.33±0.01
StateSpace,0.33±0.01,0.52±0.01,0.44±0.01,0.45±0.01,0.35±0.01,0.34±0.01
StateSpace-SA,0.31±0.01,0.53±0.01,0.44±0.01,0.45±0.01,0.35±0.01,0.35±0.01
"Griesemer et al., 2021",0.34±0.01,0.53±0.01,0.43±0.01,0.45±0.01,0.33±0.01,0.35±0.01
