In [1]:
import numpy as np
import pandas as pd
import scipy
import scipy.stats

In [2]:
#root directory of the results; cv_scores.tsv is read from <data_dir>/<cell>/<model>/
#NOTE(review): absolute, user-specific path - adjust before running on another machine
data_dir = '/s/project/mll/sergey/effect_prediction/MLM/griesemer/SVR/activity_pred/'

In [3]:
#cell types to evaluate; each one is a sub-directory of data_dir
cells = [
    'HEK293FT',
    'HMEC',
    'HEPG2',
    'GM12878',
    'K562',
    'SKNSH',
]

#models to compare; each one is a sub-directory of the cell directory
models = [
    'MLM',
    '4mers',
    'word2vec',
    'griesemer',
]

In [4]:
#gather the cross-validation scores of every (cell, model) combination into one table

N_rounds = 300 #keep at most this many train/test splits per combination

score_tables = []

for cell_type in cells:
    for model in models:
        if '5mers' in model:
            continue #models whose name contains '5mers' are left out
        cv_scores = pd.read_csv(f'{data_dir}{cell_type}/{model}/cv_scores.tsv', sep='\t')
        #tag every row with its origin so the tables can be told apart after concatenation
        cv_scores = cv_scores.assign(model=model, cell=cell_type)
        score_tables.append(cv_scores.head(N_rounds))

all_res = pd.concat(score_tables)

In [8]:
#per-round r2 values, indexed by (cell, model) and sorted for label-based lookup
score_df = (
    all_res.loc[:, ['cell','model','round','r2']]
    .set_index(['cell','model'])
    .sort_index()
)

In [9]:
#get p-value for each pair of models for each cell type
#use paired t-test with correction for repeated CV (C. Nadeau and Y. Bengio. Inference for the generalization error. In Machine Learning 52:239–281, 2003)
#corrected statistic: t = mean(d) / sqrt((1/J + n_test/n_train) * var(d)) with J-1 degrees of freedom,
#where d are the per-round r2 differences between two models and J is the number of paired rounds

test_train_ratio = 0.11 #ratio between test and train counts in each split

pvals_per_cell = {}

for cell in cells:
    #r2 per round for every model, sliced once per cell instead of once per model pair
    round_scores = {model: score_df.loc[(cell, model)].set_index('round').r2 for model in models}
    #start from an all-NaN matrix: the diagonal (model vs itself) and untestable pairs stay NaN
    #(np.nan is used because the np.NaN alias was removed in NumPy 2.0)
    cell_res = np.full((len(models), len(models)), np.nan)
    for model1_idx, model1 in enumerate(models):
        #the two-sided p-value is symmetric, so only pairs with model2 after model1 are computed
        for model2_idx in range(model1_idx + 1, len(models)):
            model2 = models[model2_idx]
            #subtraction aligns on 'round'; rounds missing for either model are dropped
            diff_score = (round_scores[model1] - round_scores[model2]).dropna()
            #use the actual number of paired rounds rather than assuming N_rounds are available
            n_paired = len(diff_score)
            if n_paired < 2:
                continue #variance is undefined, leave the p-value as NaN
            t_stat = diff_score.mean()/np.sqrt((1/n_paired + test_train_ratio)*diff_score.var())
            pval = scipy.stats.t.sf(np.abs(t_stat), n_paired-1)*2  # two-sided pvalue
            cell_res[model1_idx, model2_idx] = pval
            cell_res[model2_idx, model1_idx] = pval
    pvals_per_cell[cell] = pd.DataFrame(cell_res, columns=models, index=models)

#stack the per-cell matrices into one frame indexed by (cell, model)
sign_tests = pd.concat(pvals_per_cell)

In [10]:
#show the p-values as text: '-' for missing entries, '<1e-3' for tiny values, three decimals otherwise
sign_tests.applymap(lambda pval: '-' if np.isnan(pval) else ('<1e-3' if pval<=1e-3 else f'{pval:.3f}'))

Unnamed: 0,Unnamed: 1,MLM,4mers,word2vec,griesemer
HEK293FT,MLM,-,0.002,0.002,0.136
HEK293FT,4mers,0.002,-,0.165,0.035
HEK293FT,word2vec,0.002,0.165,-,0.114
HEK293FT,griesemer,0.136,0.035,0.114,-
HMEC,MLM,-,<1e-3,0.007,<1e-3
HMEC,4mers,<1e-3,-,0.004,0.053
HMEC,word2vec,0.007,0.004,-,0.663
HMEC,griesemer,<1e-3,0.053,0.663,-
HEPG2,MLM,-,<1e-3,<1e-3,<1e-3
HEPG2,4mers,<1e-3,-,<1e-3,<1e-3


In [11]:
mean_columns = [] #one column of per-model mean r2 for each cell
std_columns = [] #one column of per-model r2 standard deviation for each cell

for cell in cells:
    #r2 values of this cell, grouped by the remaining 'model' index level
    r2_by_model = score_df.loc[cell].r2.groupby(level='model')
    mean_columns.append(r2_by_model.mean().rename(cell))
    std_columns.append(r2_by_model.std().rename(cell))

#models in rows, cells in columns
res_mean = pd.concat(mean_columns, axis=1)
res_std = pd.concat(std_columns, axis=1)

In [12]:
#Bonferroni correction: in every cell the MLM model is compared with each of the other models
n_pairs = (len(models) - 1) * len(cells)
significance_thr = 0.05 / n_pairs

def highlight_ns(x):
    """Return per-row CSS that makes the reference model and its statistical ties bold.

    Meant to be passed to ``Styler.apply``: ``x`` is one column of the styled
    table, ``x.name`` is used as the cell type and ``x.index`` as the model
    names. The reference model is fixed to 'MLM'. A model is treated as
    performing similarly when its p-value against the reference model in
    ``sign_tests`` is above ``significance_thr``.

    Returns a list with one CSS string per entry of ``x.index``:
    'font-weight: bold' for highlighted models, '' otherwise.
    """
    best_model = 'MLM'
    pvals_vs_best = sign_tests.loc[(x.name, best_model)]
    #NaN (the reference model against itself) compares as False and is added explicitly below
    not_significant = pvals_vs_best[pvals_vs_best > significance_thr].index
    bold_models = set(not_significant) | {best_model}
    styles = []
    for model in x.index:
        styles.append('font-weight: bold' if model in bold_models else '')
    return styles

#combine mean and standard deviation into 'mean±std' strings with two decimals each
mean_txt = res_mean.applymap(lambda mean: f'{mean:.2f}')
std_txt = res_std.applymap(lambda std: f'{std:.2f}')
res_all = mean_txt + '±' + std_txt

#rows in the order of `models`, with highlight_ns deciding which entries of each column are bold
res_all.loc[models].style.apply(highlight_ns)

Unnamed: 0_level_0,HEK293FT,HMEC,HEPG2,GM12878,K562,SKNSH
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MLM,0.12±0.08,0.31±0.05,0.24±0.10,0.25±0.09,0.16±0.05,0.15±0.05
4mers,0.05±0.03,0.26±0.05,0.13±0.08,0.15±0.08,0.11±0.05,0.10±0.04
word2vec,0.07±0.06,0.28±0.06,0.18±0.10,0.19±0.09,0.13±0.05,0.12±0.05
griesemer,0.10±0.08,0.28±0.05,0.18±0.09,0.19±0.07,0.12±0.05,0.13±0.05


In [13]:
#best hyperparameters: distinct (C, gamma, epsilon) combinations recorded for each cell and model

best_hpp = (
    all_res.loc[:, ['cell','model','C','gamma','epsilon']]
    .drop_duplicates()
    .sort_values(by=['cell','model'])
)

best_hpp

Unnamed: 0,cell,model,C,gamma,epsilon
0,GM12878,4mers,64.330355,0.000201,0.368149
0,GM12878,MLM,5.312592,0.004606,0.376697
0,GM12878,griesemer,7.063151,0.004238,0.429876
0,GM12878,word2vec,3.471536,0.003538,0.392607
0,HEK293FT,4mers,1.66757,0.007338,0.038946
0,HEK293FT,MLM,17.941839,0.000576,0.308112
0,HEK293FT,griesemer,3.660187,0.002543,0.250479
0,HEK293FT,word2vec,32.343636,0.000811,0.177635
0,HEPG2,4mers,73.922564,0.000188,0.358625
0,HEPG2,MLM,5.06606,0.005798,0.357137
