In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn.metrics
import os

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from scipy import stats
import statsmodels.stats.multicomp as mc

from statannotations.Annotator import Annotator

In [2]:
# Default font sizes applied to every plot in this notebook
mpl.rcParams.update({
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'axes.titlesize': 14,
    'axes.labelsize': 16,
})

# Auxiliary functions

In [3]:
def compute_score(df):
    """Pearson correlation between ``df.y_true`` and ``df.y_pred``.

    Parameters
    ----------
    df : object exposing ``y_true`` and ``y_pred``
        Typically one group of a prediction table.

    Returns
    -------
    tuple
        ``(r, half_width)``: the Pearson correlation coefficient and half the
        width of the interval returned by ``confidence_interval()``
        (scipy's default confidence level, 95%).
    """
    result = scipy.stats.pearsonr(df.y_true, df.y_pred)
    interval = result.confidence_interval()
    # Report the interval as a symmetric "± half-width" around r
    half_width = (interval.high - interval.low) / 2
    return (result.statistic, half_width)
    
def get_best_models(df):
    """Return the top-scoring model and every model statistically tied with it.

    Parameters
    ----------
    df : pandas.Series
        Maps a model label to a ``(score, half_width)`` tuple, where
        ``half_width`` is half the width of the score's confidence interval.

    Returns
    -------
    list
        Labels, in the order they appear in ``df``, of the highest-scoring
        model plus all models whose interval ``score ± half_width`` strictly
        overlaps the interval of the highest-scoring model. Empty if ``df``
        is empty.
    """

    def is_overlap(a, b):
        # Length of the intersection of intervals a and b must be positive;
        # any comparison involving NaN evaluates to False.
        return min(a[1], b[1]) - max(a[0], b[0]) > 0

    if len(df) == 0:
        return []

    # Tuples sort lexicographically, so the last entry has the highest score
    ranked = df.sort_values()
    best_model = ranked.index[-1]
    best_score, best_err = ranked.iloc[-1]
    best_interval = (best_score - best_err, best_score + best_err)

    best_models = []
    for model, (score, err) in df.items():
        # The top model is always kept: with a zero-width or NaN interval the
        # strict overlap test would otherwise reject it against itself.
        if model == best_model or is_overlap(best_interval, (score - err, score + err)):
            best_models.append(model)

    return best_models

def highlight_ns(x, best_models):
    """Build per-row CSS that bolds the best models of one table column.

    Parameters
    ----------
    x : pandas.Series
        One column of the results table; ``x.name`` is the cell type and
        ``x.index`` holds the model labels.
    best_models : mapping
        Cell type -> collection of model labels to emphasise (the best model
        and those not significantly different from it).

    Returns
    -------
    list of str
        ``'font-weight: bold'`` for emphasised rows, ``''`` otherwise, in the
        order of ``x.index``.
    """
    column = x.name
    css = []
    for model in x.index:
        if model in best_models[column]:
            css.append('font-weight: bold')
        else:
            css.append('')
    return css

# Collect predictions

In [4]:
# Regressor whose results directory is read below; one of 'Ridge' or 'SVR'
regressor = 'Ridge'

In [5]:
# Directory with the prediction tables of the selected regressor.
# The trailing slash is required: file names are appended by plain string concatenation.
data_dir = (
    '/lustre/groups/epigenereg01/workspace/projects/vale/MLM/griesemer/'
    f'{regressor}_LeaveGroupOut/'
)

In [6]:
# Models to compare; this order is also the row order of the results table
models = [
    'DNABERT',
    'DNABERT-2',
    'NT-MS-v2-500M',
    'Species-agnostic',
    'Species-aware',
    'griesemer',
]

# Cell types for which prediction files are loaded
cell_types = [
    'HEK293FT',
    'HMEC',
    'HEPG2',
    'GM12878',
    'K562',
    'SKNSH',
]

In [13]:
# Per cell type: predictions of all available models stacked into one table
res = {}

for cell_type in cell_types:
    # Same list object is stored in `res` and filled below
    frames = res[cell_type] = []
    for model in models:
        res_tsv = data_dir + f'{cell_type}-{model}.tsv'
        if not os.path.isfile(res_tsv):
            # Models without a results file are silently skipped
            continue
        # The first line is skipped and only columns 84 and 85 are kept
        # (presumably observed and predicted values — TODO confirm against the TSV schema)
        df = pd.read_csv(
            res_tsv,
            sep='\t',
            skiprows=1,
            usecols=[84, 85],
            names=['y_true', 'y_pred'],
        )
        df['model'] = model
        frames.append(df)
    if frames:
        res[cell_type] = pd.concat(frames)
        # Average number of rows per model for this cell type
        print(cell_type, int(res[cell_type].groupby('model').size().mean()))

HEK293FT 14944
HMEC 14946
HEPG2 14946
GM12878 14946
K562 14946
SKNSH 14946


# Expression prediction

In [44]:
# cell type -> Series of (Pearson r, CI half-width) tuples indexed by model
scores_by_cell_type = {}
# cell type -> models whose interval overlaps that of the top-scoring model
best_models = {}

for cell_type in cell_types:
    # Select the value columns explicitly so that the grouping column is not
    # passed to compute_score (applying on grouping columns is deprecated in pandas 2.2).
    scores = res[cell_type].groupby('model')[['y_true', 'y_pred']].apply(compute_score)
    scores_by_cell_type[cell_type] = scores
    best_models[cell_type] = get_best_models(scores)

# Render every (r, half-width) pair as "r±half-width" with two decimals
preds_res = pd.DataFrame(scores_by_cell_type).map(lambda x: f'{x[0]:.2f}±{x[1]:.2f}')

# Rows follow the order of `models`; best models per cell type are shown in bold
preds_res.loc[models].style.apply(lambda x: highlight_ns(x, best_models))

Unnamed: 0_level_0,HEK293FT,HMEC,HEPG2,GM12878,K562,SKNSH
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DNABERT,0.08±0.02,0.30±0.01,0.14±0.02,0.13±0.02,0.15±0.02,0.11±0.02
DNABERT-2,0.18±0.02,0.36±0.01,0.30±0.01,0.30±0.01,0.22±0.02,0.20±0.02
NT-MS-v2-500M,0.13±0.02,0.32±0.01,0.19±0.02,0.19±0.02,0.18±0.02,0.16±0.02
Species-agnostic,0.32±0.01,0.49±0.01,0.39±0.01,0.39±0.01,0.34±0.01,0.32±0.01
Species-aware,0.31±0.01,0.49±0.01,0.36±0.01,0.37±0.01,0.30±0.01,0.29±0.01
griesemer,0.22±0.02,0.44±0.01,0.25±0.02,0.25±0.02,0.28±0.01,0.26±0.01
