In [25]:
import numpy as np
import pandas as pd
import scipy
import sklearn.metrics

In [26]:
def nanpearson(y_true, y_pred):
    """Pearson correlation between targets and predictions, ignoring missing values.

    A (target, prediction) pair is dropped when either member is NaN, so a
    single missing value cannot turn the whole correlation into NaN.

    Parameters
    ----------
    y_true : array-like of float
        Observed values (numpy array or pandas Series).
    y_pred : array-like of float
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        Pearson r over the valid pairs; NaN when fewer than two valid pairs
        remain (the correlation is undefined in that case).
    """
    #work on positional numpy arrays so that a Series with a non-default index is never aligned by label
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = ~(np.isnan(y_true) | np.isnan(y_pred))
    if mask.sum() < 2:
        return np.nan
    pearson_r = scipy.stats.pearsonr(y_true[mask], y_pred[mask])[0]
    return pearson_r

def nanr2score(y_true, y_pred):
    """Coefficient of determination (R^2) ignoring missing values.

    A (target, prediction) pair is dropped when either member is NaN;
    the remaining pairs are passed to ``sklearn.metrics.r2_score``.

    Parameters
    ----------
    y_true : array-like of float
        Observed values (numpy array or pandas Series).
    y_pred : array-like of float
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        R^2 computed over the valid pairs.
    """
    #work on positional numpy arrays so that a Series with a non-default index is never aligned by label
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = ~(np.isnan(y_true) | np.isnan(y_pred))
    r2 = sklearn.metrics.r2_score(y_true[mask], y_pred[mask])
    return r2

In [27]:
#root of the per-cell / per-model result directories: data_dir/<cell>/<model>/ holds cv_scores.tsv and cv_res.npy
data_dir = '/s/project/mll/sergey/effect_prediction/MLM/griesemer/SVR/activity_pred/'

In [35]:
#score column used for the summary table and the significance tests
#NOTE(review): the collection cell only computes 'pearson_r' itself; 'r2' presumably has to come from cv_scores.tsv - confirm
metric = 'pearson_r' #pearson_r or r2

In [36]:
#tab-separated oligo table; the columns used in this notebook are oligo_id, ref_allele, alt_allele, stop_codon_dist and log2FoldChange_{Skew,Alt,Ref}_<cell>
mpra_df = pd.read_csv('/s/project/mll/sergey/effect_prediction/MLM/griesemer/mpra_df.tsv', sep='\t') #sequence info
#True where ref and alt alleles have the same length; this also holds for multi-base substitutions, not only single-nucleotide variants
is_snp = mpra_df.ref_allele.str.len() == mpra_df.alt_allele.str.len()

In [37]:
#cell types to evaluate; each name selects the log2FoldChange_*_<cell> columns of mpra_df and a sub-directory of data_dir
cells = ['HEK293FT', 'HMEC', 'HEPG2', 'GM12878', 'K562', 'SKNSH']
#models to compare; each name is a sub-directory of data_dir/<cell>/ and a row/column label in the result tables
models = ['MLM','4mers','word2vec','griesemer']

In [38]:
#collect results for all cells and all models

N_rounds = 300 #limit the number of train test splits

all_scores = [] #one score table per (cell, model); concatenated into a single DataFrame at the end
all_res = {} #(model, cell) -> array loaded from cv_res.npy
             #NOTE(review): all splits are stored here, not only the first N_rounds, so the differential-expression cell averages over every stored split

for cell_type in cells:

    #rows to drop: no measured skew, alleles of different length, or more than 5000 from the stop codon
    flt = mpra_df[f'log2FoldChange_Skew_{cell_type}'].isna()  | (~is_snp) | (mpra_df.stop_codon_dist>5000) #| mpra_df.oligo_id.str.contains('_ref$')
    kept_df = mpra_df[~flt]

    #measured value per oligo: the Alt column for oligos whose id ends with '_alt', the Ref column for all others
    #(vectorised replacement for a row-wise apply; same values, same index)
    is_alt = kept_df.oligo_id.str.endswith('_alt', na=False)
    y_test = kept_df[f'log2FoldChange_Alt_{cell_type}'].where(is_alt, kept_df[f'log2FoldChange_Ref_{cell_type}'])
    #values stored as strings may use a decimal comma
    y_test = y_test.apply(lambda x: x.replace(',','.') if isinstance(x, str) else x).astype(float)

    for model in models:
        if '5mers' in model:
            continue

        model_dir = data_dir + cell_type + '/' + model + '/'

        df = pd.read_csv(model_dir + 'cv_scores.tsv', sep='\t')
        cv_res = np.load(model_dir + 'cv_res.npy')

        #row i of cv_res is scored into row i of the score table, so both must describe the same splits
        assert len(cv_res) == len(df), f'{model_dir}: {len(cv_res)} rows in cv_res.npy vs {len(df)} rows in cv_scores.tsv'

        #keep the first N_rounds splits and compute the correlation only for those
        df = df.iloc[:N_rounds].copy()
        df['model'] = model
        df['cell'] = cell_type
        df['pearson_r'] = [nanpearson(y_test, split_pred) for split_pred in cv_res[:N_rounds]]

        all_scores.append(df)

        all_res[(model,cell_type)] = cv_res

all_scores = pd.concat(all_scores)

In [39]:
#per-split values of the selected metric, indexed (and sorted) by (cell, model) for fast .loc look-ups
score_df = (
    all_scores[['cell','model','round',metric]]
    .set_index(['cell','model'])
    .sort_index()
)

In [40]:
#get p-value for each pair of models for each cell type
#use paired t-test with correction for repeated CV (C. Nadeau and Y. Bengio. Inference for the generalization error. In Machine Learning 52:239–281, 2003)

test_train_ratio = 0.11 #ratio between test and train counts in each split

sign_tests = {} #cell -> square DataFrame (models x models) of two-sided p-values

for cell in cells:
    cell_res = np.full((len(models),len(models)), np.nan) #diagonal (model vs itself) stays NaN
    for model1_idx,model1 in enumerate(models):
        for model2_idx,model2 in enumerate(models):
            if model1!=model2:
                score1 = score_df.loc[(cell, model1)].set_index('round')[metric] #scores for model1
                score2 = score_df.loc[(cell, model2)].set_index('round')[metric] #scores for model2
                #pair the scores by round; rounds missing for either model do not contribute
                diff_score = (score1-score2).dropna()
                #use the number of rounds actually available: N_rounds is only an upper limit
                n_paired = len(diff_score)
                if n_paired > 1:
                    #corrected resampled t-test: variance inflated by (1/n + n_test/n_train)
                    t_stat = diff_score.mean()/np.sqrt((1/n_paired + test_train_ratio)*diff_score.var())
                    pval = scipy.stats.t.sf(np.abs(t_stat), n_paired-1)*2  # two-sided pvalue
                else:
                    pval = np.nan #variance undefined with fewer than two paired rounds
            else:
                pval = np.nan #np.NaN alias was removed in NumPy 2.0
            cell_res[model1_idx,model2_idx] = pval
    sign_tests[cell] = pd.DataFrame(cell_res, columns=models,index=models)

#stack the per-cell tables; the result is indexed by (cell, model)
sign_tests = pd.concat(sign_tests)

In [41]:
#render p-values as text: '-' for NaN (model vs itself), three decimals above 1e-3, '<1e-3' otherwise
#column-wise Series.map replaces DataFrame.applymap, which is deprecated since pandas 2.1
sign_tests.apply(lambda col: col.map(lambda x:'-' if np.isnan(x) else f'{x:.3f}' if x>1e-3 else '<1e-3'))

Unnamed: 0,Unnamed: 1,MLM,4mers,word2vec,griesemer
HEK293FT,MLM,-,<1e-3,0.010,0.112
HEK293FT,4mers,<1e-3,-,0.002,0.009
HEK293FT,word2vec,0.010,0.002,-,0.335
HEK293FT,griesemer,0.112,0.009,0.335,-
HMEC,MLM,-,<1e-3,0.025,<1e-3
HMEC,4mers,<1e-3,-,<1e-3,0.098
HMEC,word2vec,0.025,<1e-3,-,0.181
HMEC,griesemer,<1e-3,0.098,0.181,-
HEPG2,MLM,-,<1e-3,<1e-3,<1e-3
HEPG2,4mers,<1e-3,-,<1e-3,<1e-3


In [42]:
res_mean = [] #per cell: mean of the selected metric for each model
res_std = [] #per cell: standard deviation of the selected metric for each model

for cell in cells:
    cell_df = score_df.loc[cell,metric].reset_index()
    by_model = cell_df.groupby('model')
    cell_label = {metric:cell} #name the metric column after the cell type
    res_mean.append(by_model.mean().rename(columns=cell_label))
    res_std.append(by_model.std().rename(columns=cell_label))

#one column per cell type, one row per model
res_mean = pd.concat(res_mean,axis=1)
res_std = pd.concat(res_std,axis=1)

In [43]:
#number of hypothesis tests that the highlighting below relies on
n_pairs = len(cells)*(len(models)-1) #for each cell the MLM model is compared with all other models
#family-wise error rate of 0.05 split evenly over all comparisons
significance_thr = 0.05/n_pairs #bonferroni correction

def highlight_ns(x):
    """Column-wise styling callback: bold the reference model and its statistical ties.

    ``x`` is one column of the styled table; its ``name`` is the cell type and
    its index holds the model names. A model is rendered bold when it is the
    reference model ('MLM') or when its p-value against the reference model in
    the global ``sign_tests`` table exceeds the global ``significance_thr``.

    Returns a list of CSS strings, one per entry of ``x.index``.
    """
    best_model = 'MLM'#x.apply(lambda x:float(x.split('±')[0])).idxmax()
    #row of p-values: reference model against every model, for this cell type
    pvals_vs_best = sign_tests.loc[(x.name, best_model)]
    #the NaN on the diagonal never passes the comparison, hence the explicit check for best_model below
    not_significant = set(pvals_vs_best[pvals_vs_best > significance_thr].index)
    styles = []
    for model in x.index:
        if model == best_model or model in not_significant:
            styles.append('font-weight: bold')
        else:
            styles.append('')
    return styles

#'mean±std' strings with two decimals each
#column-wise Series.map replaces DataFrame.applymap, which is deprecated since pandas 2.1
res_all = res_mean.apply(lambda col: col.map(lambda x:f'{x:.2f}')) + '±' + res_std.apply(lambda col: col.map(lambda x:f'{x:.2f}'))

#rows in the order of `models`; highlight_ns is applied to each column (one column per cell type)
res_all.loc[models].style.apply(highlight_ns)

Unnamed: 0_level_0,HEK293FT,HMEC,HEPG2,GM12878,K562,SKNSH
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MLM,0.36±0.10,0.56±0.05,0.48±0.11,0.49±0.09,0.40±0.06,0.39±0.06
4mers,0.24±0.05,0.52±0.05,0.36±0.11,0.39±0.09,0.35±0.06,0.32±0.05
word2vec,0.30±0.08,0.54±0.05,0.42±0.11,0.44±0.10,0.37±0.06,0.36±0.06
griesemer,0.33±0.12,0.53±0.05,0.42±0.11,0.43±0.09,0.36±0.06,0.36±0.06


In [44]:
#best hyperparameters: distinct (cell, model, C, gamma, epsilon) combinations found in the score tables

best_hpp = (
    all_scores[['cell','model','C','gamma','epsilon']]
    .drop_duplicates()
    .sort_values(by=['cell','model'])
)

best_hpp

Unnamed: 0,cell,model,C,gamma,epsilon
0,GM12878,4mers,64.330355,0.000201,0.368149
0,GM12878,MLM,5.312592,0.004606,0.376697
0,GM12878,griesemer,7.063151,0.004238,0.429876
0,GM12878,word2vec,3.471536,0.003538,0.392607
0,HEK293FT,4mers,1.66757,0.007338,0.038946
0,HEK293FT,MLM,17.941839,0.000576,0.308112
0,HEK293FT,griesemer,3.660187,0.002543,0.250479
0,HEK293FT,word2vec,32.343636,0.000811,0.177635
0,HEPG2,4mers,73.922564,0.000188,0.358625
0,HEPG2,MLM,5.06606,0.005798,0.357137


# Differential expression

'To predict the effect of mutations, we simply subtract the
predicted expression or decay time of the wild-type from the
predicted expression or decay time of the mutant; we do not
train and test on the difference data directly.'

see Siegel, David A., et al. "Massively parallel analysis of human 3′ UTRs reveals that AU-rich element length and registration predict mRNA destabilization." G3 12.1 (2022): jkab404.

Rabani, Michal, et al. "A massively parallel reporter assay of 3′ UTR sequences identifies in vivo rules for mRNA degradation." Molecular cell 68.6 (2017): 1083-1094.

In [45]:
difference_r = np.zeros((len(models),len(cells))) #mean score of the predicted expression difference, models x cells

#scoring function for the configured metric; any value other than 'pearson_r' is scored with r2
score_fn = nanpearson if metric == 'pearson_r' else nanr2score

for cell_idx, cell_type in enumerate(cells):
    #same row filter as in the collection cell, so rows line up with the columns of the stored predictions
    flt = mpra_df[f'log2FoldChange_Skew_{cell_type}'].isna()  | (~is_snp) | (mpra_df.stop_codon_dist>5000) #| mpra_df.oligo_id.str.contains('_ref$')
    y_test_skew = mpra_df.loc[~flt, f'log2FoldChange_Skew_{cell_type}']
    y_test_skew = y_test_skew.iloc[::2] #take every other as identical for ref and alt
    #values stored as strings may use a decimal comma
    y_test_skew = y_test_skew.apply(lambda x:x.replace(',','.') if isinstance(x, str) else x).astype(float)
    for model_idx, model in enumerate(models):
        y_pred = all_res[(model, cell_type)]
        #predictions are assumed to alternate between the two oligos of a variant along axis 1 - TODO confirm which position is ref and which is alt
        even_pred, odd_pred = y_pred[:,0::2], y_pred[:,1::2]
        assert (np.isnan(even_pred)==np.isnan(odd_pred)).all() #if ref is nan, then alt should also be nan and vice versa
        y_pred_skew = even_pred-odd_pred #alt-ref expression
        cv_score = [score_fn(y_test_skew, split_pred) for split_pred in y_pred_skew] #selected metric for all splits
        difference_r[model_idx,cell_idx] = np.mean(cv_score)

In [46]:
#mean per-split score of the predicted expression difference, one row per model and one column per cell type
pd.DataFrame(difference_r, index=models, columns=cells).round(3)

Unnamed: 0,HEK293FT,HMEC,HEPG2,GM12878,K562,SKNSH
MLM,0.079,0.148,0.229,0.2,0.097,0.109
4mers,0.057,0.101,0.118,0.113,0.088,0.086
word2vec,0.086,0.137,0.179,0.167,0.089,0.112
griesemer,0.056,0.119,0.171,0.147,0.068,0.091
