In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn.metrics
import os

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from scipy import stats
import statsmodels.stats.multicomp as mc

from statannotations.Annotator import Annotator

In [2]:
# Default font sizes for every plot: tick labels and titles at 14, axis labels at 16.
mpl.rcParams['xtick.labelsize'] = 14
mpl.rcParams['ytick.labelsize'] = 14
mpl.rcParams['axes.titlesize'] = 14
mpl.rcParams['axes.labelsize'] = 16

# Auxiliary functions

In [28]:
def compute_score(df):
    """Return the Pearson correlation coefficient between ``df.y_true`` and ``df.y_pred``.

    Only the coefficient is returned; the p-value reported by
    ``scipy.stats.pearsonr`` is discarded.
    """
    pearson_r, _p_value = scipy.stats.pearsonr(df['y_true'], df['y_pred'])
    return pearson_r

def get_best_models(df, alpha=0.05):
    """Return the top-scoring model followed by the models statistically tied with it.

    Models are ranked by ``compute_score`` (descending), computed on all rows
    of each model. The squared residuals of every pair of models are compared
    with ``stats.wilcoxon``. Walking down the ranking, a model is added while
    the test between it and the model ranked directly above it is NOT
    significant at ``alpha``; the walk stops at the first significant pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``y_true``, ``y_pred`` and ``model``.
        # assumes every model has the same number of rows in the same sample
        # order, as required by the paired Wilcoxon test - TODO confirm
    alpha : float, default 0.05
        Threshold applied to the raw p-values. Any multiple-testing correction
        must already be folded into this value by the caller.

    Returns
    -------
    list
        Model names, best model first. When ``df`` holds a single model, that
        model alone is returned without running any test.
    """
    # ranking of the models, highest score first
    per_model_scores = df.groupby('model').apply(compute_score).sort_values(ascending=False)
    sorted_models = list(per_model_scores.index)

    # nothing to compare against: MultiComparison cannot be built from one group
    if len(sorted_models) < 2:
        return sorted_models

    error = (df.y_true - df.y_pred) ** 2  # squared residuals

    comparison = mc.MultiComparison(error, df['model'])
    _table, _raw_results, pair_records = comparison.allpairtest(stats.wilcoxon, method="bonf")

    models_stats = pd.DataFrame(pair_records).set_index(['group1', 'group2'])

    # the built-in Bonferroni correction is for the wrong number of pairs in
    # the study, so the decision is re-derived from the raw p-values
    models_stats['reject'] = models_stats['pval'] < alpha
    rejected = models_stats['reject'].to_dict()  # {(group1, group2): bool}

    def _not_significant(first, second):
        # a pair may be stored under either ordering of the two model names
        return any(not rejected.get(pair, True)
                   for pair in ((first, second), (second, first)))

    best_models = [sorted_models[0]]  # always include the best model

    # NOTE(review): each model is compared with the model ranked directly above
    # it, not with the top model itself, so ties can chain (A~B and B~C keeps C
    # even if A and C differ). Confirm this is the intended definition.
    for higher, lower in zip(sorted_models, sorted_models[1:]):
        if not _not_significant(higher, lower):
            # as soon as a significant difference is detected, stop adding models
            break
        best_models.append(lower)

    return best_models

def highlight_ns(x, best_models):
    """Build one CSS style per entry of ``x.index`` for use with ``Styler.apply``.

    ``x.name`` selects the entry of ``best_models`` to consult. Index labels
    found in that entry get ``'font-weight: bold'``; all others get an empty
    string.
    """
    column_name = x.name
    styles = []
    for row_label in x.index:
        is_best = row_label in best_models[column_name]
        styles.append('font-weight: bold' if is_best else '')
    return styles

# Collect predictions

In [4]:
# Directory holding the '<cell_type>-<model>.tsv' prediction files.
# Set the SVR_DATA_DIR environment variable to run the notebook on another machine;
# os.path.join(..., '') guarantees the trailing separator that later string concatenation relies on.
data_dir = os.path.join(
    os.environ.get('SVR_DATA_DIR',
                   '/s/project/mll/sergey/effect_prediction/MLM/griesemer/SVR_LeaveGroupOut/'),
    '')

In [22]:
# Model names; each one is part of a result file name '<cell_type>-<model>.tsv'
# and a row of the result tables.
models = [
    'MLM',
    '4mers',
    'word2vec',
    'griesemer',
]

# Cell types; each one is part of a result file name and a column of the result tables.
cell_types = [
    'HEK293FT',
    'HMEC',
    'HEPG2',
    'GM12878',
    'K562',
    'SKNSH',
]

In [23]:
# res maps each cell type to one long table of predictions (all models stacked).
res = {}

for cell_type in cell_types:
    res[cell_type] = frames = []
    for model in models:
        res_tsv = data_dir + f'{cell_type}-{model}.tsv'
        if not os.path.isfile(res_tsv):
            continue  # result files that do not exist are skipped silently
        # the first line is skipped and only columns 84 and 85 are read
        df = pd.read_csv(res_tsv, sep='\t', skiprows=1, usecols=[84,85], names=['y_true','y_pred'])
        df['model'] = model  # tag every row with the model it came from
        frames.append(df)
    # the entry stays an empty list when no file was found for this cell type
    if frames:
        res[cell_type] = pd.concat(frames)

# Expression prediction

In [33]:
preds_res = {}   # cell type -> Pearson r of every model
best_models = {} # cell type -> models to print in bold

# Bonferroni-corrected significance level: within every cell type at most
# len(models)-1 pairwise tests are consulted by get_best_models.
n_pairs = len(cell_types) * (len(models) - 1)
alpha = 0.05 / n_pairs

for cell_type in cell_types:
    # the score is computed on all predictions of a model pooled together
    preds_res[cell_type] = res[cell_type].groupby('model').apply(compute_score)
    best_models[cell_type] = get_best_models(res[cell_type], alpha=alpha)

# Format every score as a two-decimal string. Series.map is used column by
# column because DataFrame.applymap is deprecated in recent pandas releases.
preds_res = pd.DataFrame(preds_res).apply(lambda scores: scores.map('{:.2f}'.format))

# rows in the order of `models`; best and statistically tied models in bold
preds_res.loc[models].style.apply(lambda x: highlight_ns(x, best_models))

Unnamed: 0_level_0,HEK293FT,HMEC,HEPG2,GM12878,K562,SKNSH
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MLM,0.37,0.56,0.5,0.52,0.4,0.4
4mers,0.24,0.52,0.38,0.4,0.34,0.33
word2vec,0.28,0.55,0.45,0.46,0.37,0.36
griesemer,0.35,0.54,0.44,0.46,0.37,0.37


# Differential expression

'To predict the effect of mutations, we simply subtract the
predicted expression or decay time of the wild-type from the
predicted expression or decay time of the mutant; we do not
train and test on the difference data directly.'

see Siegel, David A., et al. "Massively parallel analysis of human 3′ UTRs reveals that AU-rich element length and registration predict mRNA destabilization." G3 12.1 (2022): jkab404.

Rabani, Michal, et al. "A massively parallel reporter assay of 3′ UTR sequences identifies in vivo rules for mRNA degradation." Molecular cell 68.6 (2017): 1083-1094.

In [51]:
preds_res = {}   # cell type -> Pearson r of every model on the alt-ref differences
best_models = {} # cell type -> models to print in bold

# Bonferroni-corrected significance level: within every cell type at most
# len(models)-1 pairwise tests are consulted by get_best_models.
n_pairs = len(cell_types) * (len(models) - 1)
alpha = 0.05 / n_pairs

for cell_type in cell_types:

    df = res[cell_type]

    # assumes rows alternate reference / alternative (even positions = ref,
    # odd positions = alt) and that every model contributes an even number of
    # rows, so that no pair straddles two models - TODO confirm against the input files
    ref_df = df.iloc[0:-1:2].reset_index(drop=True)
    alt_df = df.iloc[1::2].reset_index(drop=True)

    # suffixes follow the merge order: left frame = ref, right frame = alt
    delta_df = ref_df.merge(alt_df[['y_true', 'y_pred']], left_index=True, right_index=True,
                            suffixes=('_ref', '_alt'))

    # effect of the variant = alternative minus reference, for measurements and predictions alike
    delta_df['y_true'] = delta_df.y_true_alt - delta_df.y_true_ref
    delta_df['y_pred'] = delta_df.y_pred_alt - delta_df.y_pred_ref

    # drop pairs whose measured difference is missing
    delta_df = delta_df[delta_df.y_true.notna()]

    preds_res[cell_type] = delta_df.groupby('model').apply(compute_score)
    best_models[cell_type] = get_best_models(delta_df, alpha=alpha)

# Format every score as a two-decimal string. Series.map is used column by
# column because DataFrame.applymap is deprecated in recent pandas releases.
preds_res = pd.DataFrame(preds_res).apply(lambda scores: scores.map('{:.2f}'.format))
preds_res.loc[models].style.apply(lambda x: highlight_ns(x, best_models))

Unnamed: 0_level_0,HEK293FT,HMEC,HEPG2,GM12878,K562,SKNSH
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MLM,0.08,0.14,0.24,0.2,0.09,0.11
4mers,0.05,0.1,0.12,0.11,0.08,0.09
word2vec,0.09,0.14,0.19,0.17,0.09,0.11
griesemer,0.04,0.12,0.18,0.16,0.07,0.09
