In [1]:
import numpy as np
import pandas as pd
import os

import sys
sys.path.append("/home/icb/sergey.vilov/workspace/MLM/mpra/utils") 

from misc import pearson_r, get_best_models, highlight_ns 

In [2]:
# Display label (used for the 'model' column and the table rows below)
# -> tag that appears in the prediction file names.
# The insertion order defines the row order of the final table.
models = dict([
    ('DNABERT', 'dnabert'),
    ('DNABERT-3UTR', 'dnabert-3utr'),
    ('DNABERT-2', 'dnabert2'),
    ('DNABERT2-3UTR', 'dnabert2-3utr'),
    ('NTv2-250M', 'ntrans-v2-250m'),
    ('NTv2-250M-3UTR', 'ntrans-v2-250m-3utr'),
    ('StateSpace', 'stspace'),
    ('StateSpace-SA', 'stspace-spaw'),
    ('5-mers Siegel et al., 2022', '5mers'),
])

In [3]:
# Settings selecting which prediction run is summarised; both values are
# interpolated into the predictions directory path (onlyref_<onlyref>/<regressor>/).
onlyref = 0
regressor = 'SVR'  # one of: Ridge, SVR or MLP

In [4]:
# Directory holding the per-model prediction TSVs for the selected settings:
# a fixed root followed by the onlyref/regressor sub-folders (trailing slash kept,
# since file names are appended to this string directly).
data_dir = (
    '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/mpra/siegel_2022/predictions/'
    f'onlyref_{onlyref}/{regressor}/'
)

In [5]:
# Load the per-model prediction tables for every (response, cell type) condition.
# res[(response, cell_type)] ends up as a single DataFrame with all models stacked
# (tagged by a 'model' column), or stays an empty list when no file was found.
res = {}

for response in ('stability', 'steady_state'):
    for cell_type in ('Jurkat', 'Beas2B'):
        key = (response, cell_type)
        res[key] = []
        for model, model_tag in models.items():
            res_tsv = f'{data_dir}{cell_type}-{response}-{model_tag}.tsv'
            if not os.path.isfile(res_tsv):
                # Models without a prediction file for this condition are skipped.
                continue
            # Columns are selected by position and renamed; the first line of the
            # file is skipped. y_true is read from position 37 (an earlier variant
            # of this call read it from position 36 instead).
            df = pd.read_csv(
                res_tsv,
                sep='\t',
                skiprows=1,
                usecols=[2, 7, 8, 37, 38, 39],
                names=['ids', 'iscontrol', 'parent_control_oligo', 'y_true', 'chrom', 'y_pred'],
            )
            df['model'] = model
            res[key].append(df)
        if res[key]:
            res[key] = pd.concat(res[key])
            # Mean number of rows per model, printed as a quick sanity check.
            N = res[key].groupby('model').size().mean()
            print(response, cell_type, int(N))

stability Jurkat 10817
stability Beas2B 2949
steady_state Jurkat 12072
steady_state Beas2B 6427


In [6]:
# Score every model on every (cell type, response) condition and render the
# results as a styled table with one row per model.
#
# best_models[(cell_type, response)] holds whatever get_best_models returns for
# that condition; it is passed to highlight_ns when styling the table.
pearson_by_condition = {}  # (cell_type, response) -> per-model pearson_r results
best_models = {}

for cell_type in ('Jurkat', 'Beas2B'):
    for response in ('steady_state', 'stability'):
        condition_df = res[(response, cell_type)]
        if len(condition_df) == 0:
            # The loading cell leaves an empty list here when no prediction file
            # was found for this condition, so there is nothing to group or score.
            print(f'no predictions for {cell_type} {response}, skipping')
            continue
        pearson_by_condition[(cell_type, response)] = condition_df.groupby('model').apply(
            lambda x: pearson_r(x.y_true, x.y_pred, compute_CI=True)
        )
        best_models[(cell_type, response)] = get_best_models(pearson_by_condition[(cell_type, response)])

# Tuple results are formatted as "first±second" (presumably correlation and CI
# half-width -- confirm against pearson_r); anything else, e.g. a model missing
# from one condition, is shown as 'none'.
preds_res = pd.DataFrame(pearson_by_condition).map(
    lambda x: f'{x[0]:.2f}±{x[1]:.2f}' if isinstance(x, tuple) else 'none'
)

# Keep the row order of `models`, but only ask for models that actually have
# predictions: .loc with a label that is absent from the index raises KeyError.
available_models = [name for name in models if name in preds_res.index]

preds_res.loc[available_models].style.apply(lambda x: highlight_ns(x, best_models))

Unnamed: 0_level_0,Jurkat,Jurkat,Beas2B,Beas2B
Unnamed: 0_level_1,steady_state,stability,steady_state,stability
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
DNABERT,0.15±0.02,0.30±0.02,0.22±0.03,0.40±0.03
DNABERT-3UTR,0.22±0.02,0.46±0.02,0.27±0.03,0.52±0.03
DNABERT-2,0.15±0.02,0.27±0.02,0.15±0.03,0.34±0.04
DNABERT2-3UTR,0.14±0.02,0.29±0.02,0.22±0.03,0.41±0.03
NTv2-250M,0.09±0.02,0.23±0.02,0.28±0.02,0.34±0.03
NTv2-250M-3UTR,0.13±0.02,0.32±0.02,0.23±0.02,0.45±0.03
StateSpace,0.17±0.02,0.36±0.02,0.27±0.02,0.49±0.03
StateSpace-SA,0.18±0.02,0.35±0.02,0.28±0.02,0.49±0.03
"5-mers Siegel et al., 2022",0.20±0.02,0.46±0.01,0.14±0.02,0.51±0.02
