In [28]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import scipy.stats
import statsmodels.stats.multicomp as mc

In [29]:
# Root directory for this analysis; every path read below is built by appending to it,
# so the trailing slash is required.
data_dir = "/s/project/mll/sergey/effect_prediction/MLM/slutskin_2019/"

In [56]:
# Model registry, 'SVR_K562/' variant: display label -> directory (relative to
# data_dir) that holds that model's 'all_predictions.tsv'.
# NOTE(review): another cell in this notebook assigns the same names (models, df, IDs)
# from the 'SVR/' directories; whichever of the two ran last decides what later cells see.
models = {
    'MLM': 'SVR_K562/MLM/',
    'Word2Vec': 'SVR_K562/word2vec/',
    'Slutskin et al.': 'PolyApredictors/',
    '4-mer': 'SVR_K562/4mers/',
    '5-mer': 'SVR_K562/5mers/',
    '6-mer': 'SVR_K562/6mers/',
}

# The MLM table supplies the reference ID list: rows whose Fold column equals 'Test'.
df = pd.read_csv(data_dir + models['MLM'] + 'all_predictions.tsv', sep='\t')
IDs = df.loc[df['Fold'] == 'Test', 'ID']

In [53]:
# Model registry, 'SVR/' variant: display label -> directory (relative to
# data_dir) that holds that model's 'all_predictions.tsv'.
# NOTE(review): another cell in this notebook assigns the same names (models, df, IDs)
# from the 'SVR_K562/' directories; whichever of the two ran last decides what later cells see.
models = {
    'MLM': 'SVR/MLM/',
    'Word2Vec': 'SVR/word2vec/',
    'Slutskin et al.': 'PolyApredictors/',
    '4-mer': 'SVR/4mers/',
    '5-mer': 'SVR/5mers/',
    '6-mer': 'SVR/6mers/',
}

# The MLM table supplies the reference ID list: rows whose Fold column equals 'Test'.
df = pd.read_csv(data_dir + models['MLM'] + 'all_predictions.tsv', sep='\t')
IDs = df.loc[df['Fold'] == 'Test', 'ID']

In [46]:
# Build one long table of predictions: for every registered model, read its
# 'all_predictions.tsv', keep the rows selected by IDs (in that order), and tag
# them with the model label. The per-model pieces are stacked at the end.
preds_mpra = []

for model, path in models.items():
    df = (
        pd.read_csv(data_dir + path + 'all_predictions.tsv', sep='\t')
        .set_index('ID')
        .loc[IDs]              # raises KeyError if this model's table lacks one of the IDs
        .assign(model=model)
    )
    preds_mpra.append(df[['model', 'Expression', 'y_pred']])

# From here on preds_mpra is a DataFrame (indexed by ID), no longer a list.
preds_mpra = pd.concat(preds_mpra)

In [47]:
# Squared residual for every row: (y_pred - Expression)^2.
preds_mpra['error'] = (preds_mpra['y_pred'] - preds_mpra['Expression']).pow(2)

In [48]:
# Mean of the squared residuals ('error' column) for each model.
preds_mpra.groupby('model')['error'].mean()

model
4-mer              6.126599
5-mer              6.668723
6-mer              7.243197
MLM                6.640968
Slutskin et al.    5.108866
Word2Vec           6.399019
Name: error, dtype: float64

In [49]:
# Compare the per-row squared residuals of every pair of models with
# scipy.stats.wilcoxon, correcting the p-values with the 'bonf' method.
# NOTE(review): scipy.stats.wilcoxon treats its two inputs as paired samples, so this
# presumes every model's rows appear in the same order - confirm.
comp1 = mc.MultiComparison(data=preds_mpra['error'], groups=preds_mpra['model'])
# Only the summary table is displayed; a1 and a2 are not used in the cells shown here.
tbl, a1, a2 = comp1.allpairtest(scipy.stats.wilcoxon, method='bonf')

tbl

group1,group2,stat,pval,pval_corr,reject
4-mer,5-mer,63276.0,0.0319,0.4779,False
4-mer,6-mer,56982.0,0.0001,0.0013,True
4-mer,MLM,70215.0,0.8493,1.0,False
4-mer,Slutskin et al.,69333.0,0.6609,1.0,False
4-mer,Word2Vec,70120.0,0.8284,1.0,False
5-mer,6-mer,61506.0,0.0082,0.1225,False
5-mer,MLM,64538.0,0.0734,1.0,False
5-mer,Slutskin et al.,63129.0,0.0287,0.4305,False
5-mer,Word2Vec,65036.0,0.0989,1.0,False
6-mer,MLM,59808.0,0.0018,0.0268,True


In [50]:
# Pearson correlation coefficient between Expression and y_pred, per model
# ([0] picks the coefficient from pearsonr's result and drops the p-value).
preds_mpra.groupby('model').apply(
    lambda grp: scipy.stats.pearsonr(grp['Expression'], grp['y_pred'])[0]
)

model
4-mer              0.815425
5-mer              0.797158
6-mer              0.777396
MLM                0.799936
Slutskin et al.    0.860808
Word2Vec           0.806947
dtype: float64

In [44]:
# Pearson correlation coefficient between Expression and y_pred, per model
# ([0] picks the coefficient from pearsonr's result and drops the p-value).
# NOTE(review): same code as the preceding Pearson cell, but this cell's execution
# count (44) is lower than that of the cell that builds preds_mpra (46), so the output
# stored with it comes from an earlier kernel state - re-run or remove.
preds_mpra.groupby('model').apply(
    lambda grp: scipy.stats.pearsonr(grp['Expression'], grp['y_pred'])[0]
)

model
4-mer              0.718848
5-mer              0.727535
6-mer              0.686466
MLM                0.710272
Slutskin et al.    0.829661
Word2Vec           0.720999
dtype: float64