## Installation

Installing Bio, transformers, genomic-benchmarks, and datasets packages.  The Bio package is from Biopython; transformers package for machine learning (pytorch, tensorflow); genomic-benchmarks and datasets from ML-Bioinfo-CEITEC.


In [1]:
# already set up on Expanse; toggle for colab

# pip install -qq Bio transformers genomic-benchmarks datasets transformers[torch] pyfaidx

Set path prefix for Expanse

In [2]:
path_prefix = "/expanse/lustre/projects/nji102/sgriesmer/"

Import SNP datasets

In [3]:
# set model name

# model name is "REG_CADD_cons", "PAT_CADD_cons", "CAN_CADD_cons", "REG_CADD_cons_default", or "CAN_CADD_cons_default"
model_name = "PAT_CADD_cons_default"

In [4]:
from joblib import dump, load

test_estimator = load('/expanse/lustre/projects/nji102/sgriesmer/DNABERT_2/Output_Models/model_xpg_boost_PRVCS_' + model_name + '.joblib')



In [5]:
# Load Rate_Patho dataset

In [6]:
import pandas as pd

path_prefix = "/expanse/lustre/projects/nji102/sgriesmer/"

RARPAT_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/RARPAT_0_102-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [7]:
RARPAT_abs_diff_plus_log_odds_scores_norm.shape

(102, 1380)

Import conservation scores

In [8]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/Rare_Patho_SNV_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(         Name  priPhCons  priPhyloP  GerpN  GerpS
 0  RARPAT0001      0.001     -3.552   4.26 -6.250
 1  RARPAT0002      0.021     -1.482   4.36 -3.380
 2  RARPAT0003      0.000     -3.401   4.09 -3.280
 3  RARPAT0004      0.097      0.255   2.12  0.856
 4  RARPAT0005      0.069     -0.558   2.47 -4.950,
 (102, 5))

In [9]:
cons_scores

Unnamed: 0,Name,priPhCons,priPhyloP,GerpN,GerpS
0,RARPAT0001,0.001,-3.552,4.26,-6.2500
1,RARPAT0002,0.021,-1.482,4.36,-3.3800
2,RARPAT0003,0.000,-3.401,4.09,-3.2800
3,RARPAT0004,0.097,0.255,2.12,0.8560
4,RARPAT0005,0.069,-0.558,2.47,-4.9500
...,...,...,...,...,...
97,RARPAT0098,0.956,0.597,5.66,5.6600
98,RARPAT0099,0.916,0.597,5.14,0.0847
99,RARPAT0100,0.029,0.484,4.10,0.2040
100,RARPAT0101,0.001,-0.255,4.26,-4.4600


Drop name column before concatenating

In [10]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [11]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.001,-3.552,4.26,-6.2500
1,0.021,-1.482,4.36,-3.3800
2,0.000,-3.401,4.09,-3.2800
3,0.097,0.255,2.12,0.8560
4,0.069,-0.558,2.47,-4.9500
...,...,...,...,...
97,0.956,0.597,5.66,5.6600
98,0.916,0.597,5.14,0.0847
99,0.029,0.484,4.10,0.2040
100,0.001,-0.255,4.26,-4.4600


Normalize conservation scores

In [12]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))

In [13]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,-0.804896,-5.978133,1.105324,-3.074966
1,-0.702284,-2.482164,1.180027,-1.709069
2,-0.810026,-5.723113,0.978329,-1.661477
3,-0.312361,0.451409,-0.493317,0.306939
4,-0.456017,-0.921645,-0.231857,-2.456267
...,...,...,...,...
97,4.094798,1.029004,2.151165,2.593270
98,3.889575,1.029004,1.762710,-0.060140
99,-0.661240,0.838162,0.985800,-0.003363
100,-0.804896,-0.409916,1.105324,-2.223065


Concatenate chromatin feature scores and conservation scores

In [14]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
RARPAT_abs_diff_plus_log_odds_and_cons_scores_norm = pd.concat([RARPAT_abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [15]:
RARPAT_abs_diff_plus_log_odds_and_cons_scores_norm

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1,priPhCons,priPhyloP,GerpN,GerpS
0,-0.206538,2.906377,-0.170420,0.426654,4.482777,-0.250790,-0.309892,-0.490882,1.445626,-0.223756,...,-0.345971,-0.380637,-0.121216,-0.384348,-0.293211,-0.380950,-0.804896,-5.978133,1.105324,-3.074966
1,-0.199992,-0.375900,-0.181145,0.090514,-0.511342,-0.245346,-0.315248,-0.518109,-0.447039,-0.203248,...,0.005478,-0.285432,-0.564576,-0.375864,-0.276490,-0.429664,-0.702284,-2.482164,1.180027,-1.709069
2,-0.206664,-0.596656,0.064562,-0.834445,-0.511587,-0.200498,-0.315569,-0.146835,-0.445965,-0.220517,...,0.588175,0.127299,-0.620407,-0.369813,1.355054,0.490738,-0.810026,-5.723113,0.978329,-1.661477
3,-0.206959,-0.652848,-0.199506,-1.051992,-0.511260,-0.250757,-0.315690,-0.531651,-0.447218,-0.225224,...,-0.414731,-0.393139,-0.635052,-0.382570,-0.302146,-0.436147,-0.312361,0.451409,-0.493317,0.306939
4,-0.206414,-0.607214,-0.142333,-0.135244,-0.511261,-0.250779,-0.314809,-0.530276,-0.366058,-0.225169,...,-0.283245,-0.315174,1.199907,-0.343330,-0.283350,-0.238857,-0.456017,-0.921645,-0.231857,-2.456267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,0.358051,1.170373,0.535112,0.557339,-0.139149,-0.221604,-0.294488,-0.465753,-0.089202,-0.127893,...,-0.301338,-0.008619,-0.463245,0.107149,-0.249296,-0.181547,4.094798,1.029004,2.151165,2.593270
98,0.395013,3.306250,0.176883,0.839464,0.098059,-0.159920,-0.292982,-0.471717,0.924219,-0.163577,...,-0.416086,-0.071533,-0.495524,-0.187121,-0.251338,-0.333898,3.889575,1.029004,1.762710,-0.060140
99,1.713206,-0.532519,0.096789,0.765235,2.301548,-0.026938,-0.315642,-0.385426,-0.442458,0.458197,...,0.099295,0.810668,-0.066553,-0.376010,0.247688,0.248135,-0.661240,0.838162,0.985800,-0.003363
100,-0.206210,0.352594,-0.198994,-0.541340,-0.498336,-0.250799,1.820070,-0.310376,0.780945,-0.225134,...,-0.071771,-0.149783,-0.630602,-0.381858,-0.303848,-0.410570,-0.804896,-0.409916,1.105324,-2.223065


In [16]:
# Set model names to numbers

In [17]:
col_lab = list(range(0,1384))
RARPAT_abs_diff_plus_log_odds_and_cons_scores_norm = RARPAT_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [18]:
RARPAT_abs_diff_plus_log_odds_and_cons_scores_norm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1374,1375,1376,1377,1378,1379,1380,1381,1382,1383
0,-0.206538,2.906377,-0.170420,0.426654,4.482777,-0.250790,-0.309892,-0.490882,1.445626,-0.223756,...,-0.345971,-0.380637,-0.121216,-0.384348,-0.293211,-0.380950,-0.804896,-5.978133,1.105324,-3.074966
1,-0.199992,-0.375900,-0.181145,0.090514,-0.511342,-0.245346,-0.315248,-0.518109,-0.447039,-0.203248,...,0.005478,-0.285432,-0.564576,-0.375864,-0.276490,-0.429664,-0.702284,-2.482164,1.180027,-1.709069
2,-0.206664,-0.596656,0.064562,-0.834445,-0.511587,-0.200498,-0.315569,-0.146835,-0.445965,-0.220517,...,0.588175,0.127299,-0.620407,-0.369813,1.355054,0.490738,-0.810026,-5.723113,0.978329,-1.661477
3,-0.206959,-0.652848,-0.199506,-1.051992,-0.511260,-0.250757,-0.315690,-0.531651,-0.447218,-0.225224,...,-0.414731,-0.393139,-0.635052,-0.382570,-0.302146,-0.436147,-0.312361,0.451409,-0.493317,0.306939
4,-0.206414,-0.607214,-0.142333,-0.135244,-0.511261,-0.250779,-0.314809,-0.530276,-0.366058,-0.225169,...,-0.283245,-0.315174,1.199907,-0.343330,-0.283350,-0.238857,-0.456017,-0.921645,-0.231857,-2.456267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,0.358051,1.170373,0.535112,0.557339,-0.139149,-0.221604,-0.294488,-0.465753,-0.089202,-0.127893,...,-0.301338,-0.008619,-0.463245,0.107149,-0.249296,-0.181547,4.094798,1.029004,2.151165,2.593270
98,0.395013,3.306250,0.176883,0.839464,0.098059,-0.159920,-0.292982,-0.471717,0.924219,-0.163577,...,-0.416086,-0.071533,-0.495524,-0.187121,-0.251338,-0.333898,3.889575,1.029004,1.762710,-0.060140
99,1.713206,-0.532519,0.096789,0.765235,2.301548,-0.026938,-0.315642,-0.385426,-0.442458,0.458197,...,0.099295,0.810668,-0.066553,-0.376010,0.247688,0.248135,-0.661240,0.838162,0.985800,-0.003363
100,-0.206210,0.352594,-0.198994,-0.541340,-0.498336,-0.250799,1.820070,-0.310376,0.780945,-0.225134,...,-0.071771,-0.149783,-0.630602,-0.381858,-0.303848,-0.410570,-0.804896,-0.409916,1.105324,-2.223065


In [19]:
RARPAT_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/Rare_Patho_SNV_dataset_XY-named.txt", sep='\t')

In [20]:
y = RARPAT_prvcs['label']

In [21]:
y

0      0
1      0
2      0
3      0
4      0
      ..
97     1
98     1
99     1
100    0
101    1
Name: label, Length: 102, dtype: int64

In [22]:
RARPAT_pred = test_estimator.predict(RARPAT_abs_diff_plus_log_odds_and_cons_scores_norm)

In [23]:
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score, average_precision_score

print("Accuracy_model_REG: %.4f" % (accuracy_score(y, RARPAT_pred)))  
print("F1_model_REG: %.4f" % (f1_score(y, RARPAT_pred)))  
print("MCC_model_REG: %.4f" % (matthews_corrcoef(y, RARPAT_pred)))  
print("ROC_AUC_model_REG: %.4f" % (roc_auc_score(y , RARPAT_pred)))
print("ROC_PRC_model_REG: %.4f" % (average_precision_score(y , RARPAT_pred)))

Accuracy_model_REG: 0.5686
F1_model_REG: 0.0000
MCC_model_REG: 0.0000
ROC_AUC_model_REG: 0.5000
ROC_PRC_model_REG: 0.4314


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Load ASD dataset

In [24]:
ASD_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/ASD_0_107-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

Import conservation scores

In [25]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/ASD_denovo_SNV_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(      Name  priPhCons  priPhyloP  GerpN  GerpS
 0  ASD0001      0.064      0.462  0.427  0.427
 1  ASD0002      0.023      0.393  2.570  1.350
 2  ASD0003      0.002     -1.411  2.900 -4.580
 3  ASD0004      0.010     -0.650  0.626 -1.250
 4  ASD0005      0.019      0.462  4.400  3.480,
 (107, 5))

In [26]:
cons_scores

Unnamed: 0,Name,priPhCons,priPhyloP,GerpN,GerpS
0,ASD0001,0.064,0.462,0.427,0.427
1,ASD0002,0.023,0.393,2.570,1.350
2,ASD0003,0.002,-1.411,2.900,-4.580
3,ASD0004,0.010,-0.650,0.626,-1.250
4,ASD0005,0.019,0.462,4.400,3.480
...,...,...,...,...,...
102,ASD0103,0.677,-0.532,3.260,-3.500
103,ASD0104,0.093,0.198,0.930,0.930
104,ASD0105,0.001,-1.824,2.380,-4.330
105,ASD0106,0.044,-1.067,2.880,0.648


Drop name column before concatenating

In [27]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [28]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.064,0.462,0.427,0.427
1,0.023,0.393,2.570,1.350
2,0.002,-1.411,2.900,-4.580
3,0.010,-0.650,0.626,-1.250
4,0.019,0.462,4.400,3.480
...,...,...,...,...
102,0.677,-0.532,3.260,-3.500
103,0.093,0.198,0.930,0.930
104,0.001,-1.824,2.380,-4.330
105,0.044,-1.067,2.880,0.648


Normalize conservation scores

In [29]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))

In [30]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,-0.411880,0.639408,-1.844404,0.252143
1,-0.538299,0.546163,-0.395911,0.582231
2,-0.603051,-1.891741,-0.172858,-1.538488
3,-0.578384,-0.863335,-1.709897,-0.347595
4,-0.550633,0.639408,0.841019,1.343973
...,...,...,...,...
102,1.478249,-0.703871,0.070472,-1.152253
103,-0.322461,0.282642,-1.504417,0.432028
104,-0.606134,-2.449864,-0.524336,-1.449082
105,-0.473548,-1.426863,-0.186377,0.331178


Concatenate chromatin feature scores and conservation scores

In [31]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
ASD_abs_diff_plus_log_odds_and_cons_scores_norm = pd.concat([ASD_abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [32]:
ASD_abs_diff_plus_log_odds_and_cons_scores_norm

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1,priPhCons,priPhyloP,GerpN,GerpS
0,-0.229695,-0.602858,-0.290626,-0.089052,-0.163430,-0.288711,-0.374899,-0.487998,-0.477156,-0.318228,...,-0.545594,-0.354866,-0.323475,-0.450354,-0.393204,-0.429022,-0.411880,0.639408,-1.844404,0.252143
1,-0.229542,-0.607343,-0.285806,0.696796,-0.466199,-0.288711,-0.372011,-0.445119,-0.462745,-0.319469,...,-0.535811,-0.428614,-0.507799,-0.450341,-0.392362,-0.331279,-0.538299,0.546163,-0.395911,0.582231
2,-0.229701,-0.664080,-0.290600,-0.174744,-0.453984,-0.288711,-0.370807,-0.534545,-0.476378,-0.319586,...,-0.543638,-0.424393,-0.374857,-0.436132,-0.388976,-0.426025,-0.603051,-1.891741,-0.172858,-1.538488
3,-0.229476,-0.584708,-0.290655,-0.789959,-0.462516,-0.288706,-0.362875,-0.532215,-0.300705,-0.318731,...,-0.475389,-0.288862,1.355761,-0.446610,-0.392134,-0.398144,-0.578384,-0.863335,-1.709897,-0.347595
4,-0.225621,1.027578,-0.278320,-0.599621,0.840352,-0.288705,-0.152654,-0.550221,-0.469771,-0.319039,...,-0.556688,-0.389388,-0.598057,-0.362129,-0.394222,-0.415341,-0.550633,0.639408,0.841019,1.343973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,-0.214256,-0.687476,-0.016445,-0.741008,-0.467378,-0.073159,-0.376444,-0.554693,-0.475152,-0.310869,...,-0.394164,-0.244371,-0.620546,-0.432624,-0.283490,-0.293363,1.478249,-0.703871,0.070472,-1.152253
103,-0.208566,-0.606097,0.049127,-0.549999,-0.467226,-0.140918,-0.224215,0.030520,-0.138759,-0.291726,...,0.079465,-0.430403,-0.487613,-0.384336,-0.120339,1.260034,-0.322461,0.282642,-1.504417,0.432028
104,-0.229132,0.378171,-0.278365,-0.783417,-0.409220,-0.283918,-0.325528,-0.119599,-0.337254,-0.318836,...,-0.359191,-0.269229,-0.161170,-0.440565,0.254996,-0.370670,-0.606134,-2.449864,-0.524336,-1.449082
105,-0.225772,-0.358270,-0.251520,2.095907,-0.113983,-0.288496,0.223446,0.162861,0.770044,-0.319727,...,0.025542,-0.288175,0.650125,1.904538,-0.381119,-0.345916,-0.473548,-1.426863,-0.186377,0.331178


In [33]:
# Set model names to numbers

In [34]:
col_lab = list(range(0,1384))
ASD_abs_diff_plus_log_odds_and_cons_scores_norm = ASD_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [35]:
ASD_abs_diff_plus_log_odds_and_cons_scores_norm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1374,1375,1376,1377,1378,1379,1380,1381,1382,1383
0,-0.229695,-0.602858,-0.290626,-0.089052,-0.163430,-0.288711,-0.374899,-0.487998,-0.477156,-0.318228,...,-0.545594,-0.354866,-0.323475,-0.450354,-0.393204,-0.429022,-0.411880,0.639408,-1.844404,0.252143
1,-0.229542,-0.607343,-0.285806,0.696796,-0.466199,-0.288711,-0.372011,-0.445119,-0.462745,-0.319469,...,-0.535811,-0.428614,-0.507799,-0.450341,-0.392362,-0.331279,-0.538299,0.546163,-0.395911,0.582231
2,-0.229701,-0.664080,-0.290600,-0.174744,-0.453984,-0.288711,-0.370807,-0.534545,-0.476378,-0.319586,...,-0.543638,-0.424393,-0.374857,-0.436132,-0.388976,-0.426025,-0.603051,-1.891741,-0.172858,-1.538488
3,-0.229476,-0.584708,-0.290655,-0.789959,-0.462516,-0.288706,-0.362875,-0.532215,-0.300705,-0.318731,...,-0.475389,-0.288862,1.355761,-0.446610,-0.392134,-0.398144,-0.578384,-0.863335,-1.709897,-0.347595
4,-0.225621,1.027578,-0.278320,-0.599621,0.840352,-0.288705,-0.152654,-0.550221,-0.469771,-0.319039,...,-0.556688,-0.389388,-0.598057,-0.362129,-0.394222,-0.415341,-0.550633,0.639408,0.841019,1.343973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,-0.214256,-0.687476,-0.016445,-0.741008,-0.467378,-0.073159,-0.376444,-0.554693,-0.475152,-0.310869,...,-0.394164,-0.244371,-0.620546,-0.432624,-0.283490,-0.293363,1.478249,-0.703871,0.070472,-1.152253
103,-0.208566,-0.606097,0.049127,-0.549999,-0.467226,-0.140918,-0.224215,0.030520,-0.138759,-0.291726,...,0.079465,-0.430403,-0.487613,-0.384336,-0.120339,1.260034,-0.322461,0.282642,-1.504417,0.432028
104,-0.229132,0.378171,-0.278365,-0.783417,-0.409220,-0.283918,-0.325528,-0.119599,-0.337254,-0.318836,...,-0.359191,-0.269229,-0.161170,-0.440565,0.254996,-0.370670,-0.606134,-2.449864,-0.524336,-1.449082
105,-0.225772,-0.358270,-0.251520,2.095907,-0.113983,-0.288496,0.223446,0.162861,0.770044,-0.319727,...,0.025542,-0.288175,0.650125,1.904538,-0.381119,-0.345916,-0.473548,-1.426863,-0.186377,0.331178


In [36]:
ASD_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/ASD_denovo_SNV_dataset_XY-named.txt", sep='\t')

In [37]:
y = ASD_prvcs['label']

In [38]:
ASD_pred = test_estimator.predict(ASD_abs_diff_plus_log_odds_and_cons_scores_norm)

In [39]:
print("Accuracy_model_REG: %.4f" % (accuracy_score(y, ASD_pred)))  
print("F1_model_REG: %.4f" % (f1_score(y, ASD_pred)))  
print("MCC_model_REG: %.4f" % (matthews_corrcoef(y, ASD_pred)))  
print("ROC_AUC_model_REG: %.4f" % (roc_auc_score(y , ASD_pred)))
print("Avg_Recall_Score_model_REG: %.4f" % (average_precision_score(y , ASD_pred)))

Accuracy_model_REG: 0.4766
F1_model_REG: 0.0345
MCC_model_REG: 0.0910
ROC_AUC_model_REG: 0.5088
Avg_Recall_Score_model_REG: 0.5409


Load GTEx dataset

In [40]:
GTEX_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/GTEX_0_796-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [41]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/GTEx_eQTL_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(       Name  priPhCons  priPhyloP  GerpN  GerpS
 0  GTEX0001      0.007      0.375  1.150  1.150
 1  GTEX0002      0.115     -0.033  1.909 -0.200
 2  GTEX0003      0.115     -0.033  1.909 -0.200
 3  GTEX0004      0.000     -0.600  1.960 -0.495
 4  GTEX0005      0.026      0.387  1.250 -1.360,
 (796, 5))

In [42]:
cons_scores

Unnamed: 0,Name,priPhCons,priPhyloP,GerpN,GerpS
0,GTEX0001,0.007,0.375,1.150,1.150
1,GTEX0002,0.115,-0.033,1.909,-0.200
2,GTEX0003,0.115,-0.033,1.909,-0.200
3,GTEX0004,0.000,-0.600,1.960,-0.495
4,GTEX0005,0.026,0.387,1.250,-1.360
...,...,...,...,...,...
791,GTEX0792,0.115,-0.033,1.909,-0.200
792,GTEX0793,0.034,0.146,0.149,0.149
793,GTEX0794,0.115,-0.033,1.909,-0.200
794,GTEX0795,0.115,-0.033,1.909,-0.200


Drop name column before concatenating

In [43]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [44]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.007,0.375,1.150,1.150
1,0.115,-0.033,1.909,-0.200
2,0.115,-0.033,1.909,-0.200
3,0.000,-0.600,1.960,-0.495
4,0.026,0.387,1.250,-1.360
...,...,...,...,...
791,0.115,-0.033,1.909,-0.200
792,0.034,0.146,0.149,0.149
793,0.115,-0.033,1.909,-0.200
794,0.115,-0.033,1.909,-0.200


Normalize conservation scores

In [45]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))

In [46]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,-0.909252,1.207975,-0.986787,1.188003
1,0.007110,0.063071,-0.059739,0.073786
2,0.007110,0.063071,-0.059739,0.073786
3,-0.968646,-1.528009,0.002553,-0.169691
4,-0.748040,1.241649,-0.864647,-0.883615
...,...,...,...,...
791,0.007110,0.063071,-0.059739,0.073786
792,-0.680161,0.565370,-2.209417,0.361832
793,0.007110,0.063071,-0.059739,0.073786
794,0.007110,0.063071,-0.059739,0.073786


Concatenate chromatin feature scores and conservation scores

In [47]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
GTEX_abs_diff_plus_log_odds_and_cons_scores_norm = pd.concat([GTEX_abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [48]:
GTEX_abs_diff_plus_log_odds_and_cons_scores_norm

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1,priPhCons,priPhyloP,GerpN,GerpS
0,-0.178355,-0.485832,-0.224357,-0.385733,-0.429759,-0.209388,-0.244968,-0.468064,-0.360461,-0.188559,...,-0.289796,-0.205919,-0.389517,-0.261726,-0.260624,-0.351443,-0.909252,1.207975,-0.986787,1.188003
1,-0.178352,-0.246669,-0.221942,0.956891,3.155623,-0.209369,-0.240326,0.142329,2.235803,-0.153395,...,-0.169110,0.110825,0.377795,-0.217207,-0.253757,-0.228107,0.007110,0.063071,-0.059739,0.073786
2,-0.178302,-0.517851,-0.224342,-0.729727,-0.162195,-0.209389,-0.243257,-0.470614,-0.358797,-0.188556,...,-0.265715,-0.264661,-0.455395,-0.263826,-0.255614,-0.323819,0.007110,0.063071,-0.059739,0.073786
3,-0.178280,-0.375803,-0.223981,-0.385835,0.343213,-0.209360,-0.244811,-0.464363,-0.129742,-0.187047,...,-0.161633,-0.151567,0.832964,-0.253708,-0.236141,-0.281717,-0.968646,-1.528009,0.002553,-0.169691
4,-0.178030,0.393779,-0.223855,-0.137609,1.212573,-0.209318,4.905893,2.569003,1.323787,-0.184953,...,-0.285058,-0.317789,-0.466421,-0.265439,-0.254904,-0.169747,-0.748040,1.241649,-0.864647,-0.883615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,0.198956,-0.526851,0.979379,-0.594531,-0.429889,-0.201291,-0.244975,-0.471995,-0.360122,-0.184072,...,0.367565,0.833365,-0.147515,3.042131,0.044801,-0.188965,0.007110,0.063071,-0.059739,0.073786
792,0.018531,-0.162753,0.481206,0.670652,-0.429505,-0.190066,-0.237247,-0.270271,0.260414,-0.146905,...,-0.092712,-0.073844,-0.302482,-0.021024,-0.050756,-0.149935,-0.680161,0.565370,-2.209417,0.361832
793,-0.178362,-0.528672,-0.224362,-0.726840,-0.429889,-0.209386,-0.244985,-0.470701,-0.360351,-0.188561,...,-0.277991,-0.332510,-0.463221,-0.264702,-0.256798,-0.348923,0.007110,0.063071,-0.059739,0.073786
794,0.076135,-0.512977,0.433604,-0.214048,-0.429695,-0.193048,-0.241100,0.658683,-0.340889,-0.169407,...,1.220882,0.182769,-0.101256,-0.004808,-0.217325,-0.007000,0.007110,0.063071,-0.059739,0.073786


In [49]:
# Set model names to numbers

In [50]:
col_lab = list(range(0,1384))
GTEX_abs_diff_plus_log_odds_and_cons_scores_norm = GTEX_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [51]:
GTEX_abs_diff_plus_log_odds_and_cons_scores_norm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1374,1375,1376,1377,1378,1379,1380,1381,1382,1383
0,-0.178355,-0.485832,-0.224357,-0.385733,-0.429759,-0.209388,-0.244968,-0.468064,-0.360461,-0.188559,...,-0.289796,-0.205919,-0.389517,-0.261726,-0.260624,-0.351443,-0.909252,1.207975,-0.986787,1.188003
1,-0.178352,-0.246669,-0.221942,0.956891,3.155623,-0.209369,-0.240326,0.142329,2.235803,-0.153395,...,-0.169110,0.110825,0.377795,-0.217207,-0.253757,-0.228107,0.007110,0.063071,-0.059739,0.073786
2,-0.178302,-0.517851,-0.224342,-0.729727,-0.162195,-0.209389,-0.243257,-0.470614,-0.358797,-0.188556,...,-0.265715,-0.264661,-0.455395,-0.263826,-0.255614,-0.323819,0.007110,0.063071,-0.059739,0.073786
3,-0.178280,-0.375803,-0.223981,-0.385835,0.343213,-0.209360,-0.244811,-0.464363,-0.129742,-0.187047,...,-0.161633,-0.151567,0.832964,-0.253708,-0.236141,-0.281717,-0.968646,-1.528009,0.002553,-0.169691
4,-0.178030,0.393779,-0.223855,-0.137609,1.212573,-0.209318,4.905893,2.569003,1.323787,-0.184953,...,-0.285058,-0.317789,-0.466421,-0.265439,-0.254904,-0.169747,-0.748040,1.241649,-0.864647,-0.883615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,0.198956,-0.526851,0.979379,-0.594531,-0.429889,-0.201291,-0.244975,-0.471995,-0.360122,-0.184072,...,0.367565,0.833365,-0.147515,3.042131,0.044801,-0.188965,0.007110,0.063071,-0.059739,0.073786
792,0.018531,-0.162753,0.481206,0.670652,-0.429505,-0.190066,-0.237247,-0.270271,0.260414,-0.146905,...,-0.092712,-0.073844,-0.302482,-0.021024,-0.050756,-0.149935,-0.680161,0.565370,-2.209417,0.361832
793,-0.178362,-0.528672,-0.224362,-0.726840,-0.429889,-0.209386,-0.244985,-0.470701,-0.360351,-0.188561,...,-0.277991,-0.332510,-0.463221,-0.264702,-0.256798,-0.348923,0.007110,0.063071,-0.059739,0.073786
794,0.076135,-0.512977,0.433604,-0.214048,-0.429695,-0.193048,-0.241100,0.658683,-0.340889,-0.169407,...,1.220882,0.182769,-0.101256,-0.004808,-0.217325,-0.007000,0.007110,0.063071,-0.059739,0.073786


In [52]:
GTEX_abs_diff_plus_log_odds_and_cons_scores_norm = GTEX_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [53]:
GTEX_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/GTEx_eQTL_dataset_XY-named.txt", sep='\t')

In [54]:
y = GTEX_prvcs['label']

In [55]:
GTEX_pred = test_estimator.predict(GTEX_abs_diff_plus_log_odds_and_cons_scores_norm)

In [56]:
print("Accuracy_model_REG: %.4f" % (accuracy_score(y, GTEX_pred)))  
print("F1_model_REG: %.4f" % (f1_score(y, GTEX_pred)))  
print("MCC_model_REG: %.4f" % (matthews_corrcoef(y, GTEX_pred)))  
print("ROC_AUC_model_REG: %.4f" % (roc_auc_score(y , GTEX_pred)))
print("Avg_Recall_Score_model_REG: %.4f" % (average_precision_score(y , GTEX_pred)))

Accuracy_model_REG: 0.4912
F1_model_REG: 0.0146
MCC_model_REG: -0.0015
ROC_AUC_model_REG: 0.4999
Avg_Recall_Score_model_REG: 0.5087


Load Somatic eQTL dataset

In [57]:
SOMEQTL_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/SOMEQTL_0_7513-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [58]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/Somatic_eQTL_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(           Name  priPhCons  priPhyloP  GerpN  GerpS
 0  SOMEQTL00001      0.236      0.094   0.00  0.000
 1  SOMEQTL00002      0.006     -0.398   3.13 -1.420
 2  SOMEQTL00003      0.008     -0.821   4.49 -3.810
 3  SOMEQTL00004      0.008     -0.146   3.32  1.450
 4  SOMEQTL00005      0.010     -0.357   2.51  0.611,
 (7513, 5))

In [59]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [60]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.236,0.094,0.00,0.000
1,0.006,-0.398,3.13,-1.420
2,0.008,-0.821,4.49,-3.810
3,0.008,-0.146,3.32,1.450
4,0.010,-0.357,2.51,0.611
...,...,...,...,...
7508,0.266,0.124,0.81,-0.189
7509,0.505,0.487,3.68,1.190
7510,0.014,-0.243,3.68,-1.990
7511,0.007,-0.483,2.97,0.243


In [61]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))

In [62]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,1.213221,0.183020,-2.072384,0.268571
1,-0.453869,-0.510635,0.063053,-0.252106
2,-0.439372,-1.107009,0.990910,-1.128457
3,-0.439372,-0.155348,0.192680,0.800248
4,-0.424876,-0.452830,-0.359941,0.492609
...,...,...,...,...
7508,1.430667,0.225316,-1.519763,0.199270
7509,3.162991,0.737098,0.438289,0.704913
7510,-0.395883,-0.292105,0.438289,-0.461110
7511,-0.446620,-0.630474,-0.046107,0.357673


In [63]:
SOMEQTL_abs_diff_plus_log_odds_scores_norm

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwMcf7CtcfUniPk151-ran.1,UwNb4CtcfUniPk151-ran.1,UwNhdfneoCtcfUniPk151-ran.1,UwNhekCtcfUniPk151-ran.1,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1
0,-0.144903,0.195737,-0.182222,-0.108920,0.215997,-0.163565,-0.275462,-0.144383,-0.338155,-0.164474,...,-0.310175,-0.211341,-0.344102,-0.265758,-0.293365,-0.196343,-0.520804,-0.231279,-0.203031,-0.333002
1,-0.144924,-0.568833,-0.182462,-0.683622,-0.434228,-0.163565,-0.275869,-0.512477,-0.357202,-0.164471,...,-0.342034,-0.200050,-0.315634,-0.105513,-0.296193,-0.337912,-0.506100,-0.237790,-0.215480,-0.324308
2,-0.144511,-0.200158,-0.181925,-0.358322,-0.394926,-0.163529,-0.275796,-0.468553,-0.354824,-0.158311,...,0.963121,-0.209746,-0.429988,-0.135167,-0.273474,0.019215,-0.320742,0.107533,-0.203012,-0.307684
3,-0.144869,-0.526162,-0.180201,1.135465,0.723396,-0.163565,-0.237328,0.449306,-0.355735,-0.163939,...,-0.307590,-0.185465,-0.407711,-0.259176,-0.181607,-0.258805,-0.168069,-0.238298,-0.211773,-0.291270
4,-0.144908,-0.247975,-0.141101,-0.480016,-0.216463,-0.150550,-0.260551,0.054881,-0.087934,-0.103180,...,0.738351,0.587462,-0.143264,1.144424,0.048478,-0.293651,-0.452174,-0.230422,0.445050,0.516132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7508,-0.144305,-0.367277,-0.182471,-0.244942,-0.415514,-0.163563,-0.243906,-0.393012,-0.113778,-0.149147,...,-0.336289,-0.215802,-0.280409,-0.196786,-0.278953,-0.347581,0.610747,-0.238620,-0.216403,-0.295181
7509,-0.144923,-0.547270,-0.182243,1.006641,-0.432987,-0.163565,-0.275755,1.251890,-0.357355,-0.164476,...,-0.336786,-0.217042,-0.349688,-0.267488,-0.205523,-0.333389,-0.438623,-0.235375,-0.209342,-0.246323
7510,-0.144899,-0.591999,-0.182477,-0.479671,-0.434303,-0.163564,-0.275865,-0.517299,-0.357452,-0.164468,...,-0.328957,-0.143269,-0.103965,-0.264673,-0.296925,-0.072538,-0.204852,-0.234325,-0.212516,-0.237920
7511,-0.143923,0.127016,-0.182166,0.305013,-0.262310,-0.163462,-0.275867,-0.508464,-0.357306,-0.164449,...,-0.300933,-0.219915,-0.351700,0.076195,-0.290894,0.259330,-0.246323,-0.233290,-0.214649,-0.301657


In [64]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
SOMEQTL_abs_diff_plus_log_odds_and_cons_scores_norm = pd.concat([SOMEQTL_abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [65]:
col_lab = list(range(0,1384))
SOMEQTL_abs_diff_plus_log_odds_and_cons_scores_norm = SOMEQTL_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [66]:
SOMEQTL_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/Somatic_eQTL_dataset_XY-named.txt", sep='\t')

In [67]:
y = SOMEQTL_prvcs['label']

In [68]:
SOMEQTL_pred = test_estimator.predict(SOMEQTL_abs_diff_plus_log_odds_and_cons_scores_norm)

In [69]:
print("Accuracy_model_REG: %.4f" % (accuracy_score(y, SOMEQTL_pred)))  
print("F1_model_REG: %.4f" % (f1_score(y, SOMEQTL_pred)))  
print("MCC_model_REG: %.4f" % (matthews_corrcoef(y, SOMEQTL_pred)))  
print("ROC_AUC_model_REG: %.4f" % (roc_auc_score(y , SOMEQTL_pred)))
print("Avg_Recall_Score_model_REG: %.4f" % (average_precision_score(y , SOMEQTL_pred)))

Accuracy_model_REG: 0.5096
F1_model_REG: 0.0466
MCC_model_REG: 0.0805
ROC_AUC_model_REG: 0.5096
Avg_Recall_Score_model_REG: 0.5079


Load GWAS E-8 Test Dataset

In [70]:
GWAS8_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/GWAS8_0_21725-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [71]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/GWAS_SNP_5E-8_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(         Name  priPhCons  priPhyloP  GerpN  GerpS
 0  GWAS800001      0.025      0.379  1.380  1.380
 1  GWAS800002      0.130      0.158  0.122  0.122
 2  GWAS800003      0.019     -2.070  0.000  0.000
 3  GWAS800004      0.030      0.292  0.185  0.185
 4  GWAS800005      0.010     -0.276  3.070 -6.140,
 (21725, 5))

In [72]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [73]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.025,0.379,1.380,1.380
1,0.130,0.158,0.122,0.122
2,0.019,-2.070,0.000,0.000
3,0.030,0.292,0.185,0.185
4,0.010,-0.276,3.070,-6.140
...,...,...,...,...
21720,0.024,-1.400,0.000,0.000
21721,0.485,0.232,0.225,0.225
21722,0.068,0.232,0.614,0.614
21723,0.118,0.428,2.110,-0.425


In [74]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))

In [75]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,-0.364343,0.648509,-0.409791,0.801216
1,0.147629,0.353529,-1.174989,0.232482
2,-0.393598,-2.620293,-1.249198,0.177327
3,-0.339963,0.532386,-1.136669,0.260964
4,-0.437482,-0.225752,0.618179,-2.598529
...,...,...,...,...
21720,-0.369219,-1.726011,-1.249198,0.177327
21721,1.878580,0.452301,-1.112338,0.279048
21722,-0.154678,0.452301,-0.875723,0.454912
21723,0.089118,0.713912,0.034244,-0.014813


In [76]:
GWAS8_abs_diff_plus_log_odds_scores_norm

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwMcf7CtcfUniPk151-ran.1,UwNb4CtcfUniPk151-ran.1,UwNhdfneoCtcfUniPk151-ran.1,UwNhekCtcfUniPk151-ran.1,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1
0,-0.129674,-0.016937,-0.168857,-0.612777,0.550565,-0.147991,-0.040435,0.180421,-0.274353,-0.144067,...,-0.256078,-0.185529,-0.303280,-0.211185,-0.209141,0.433344,-0.291560,-0.204089,-0.157390,-0.227529
1,-0.129688,-0.514675,-0.168928,-0.653093,-0.343035,-0.147991,-0.190363,-0.455714,-0.275306,-0.144065,...,-0.244750,-0.185768,-0.104093,-0.218978,-0.217061,0.138904,2.757180,-0.196721,-0.164951,-0.263949
2,-0.129693,-0.460933,-0.168930,-0.647238,-0.343096,-0.147991,-0.190363,-0.427523,-0.275103,-0.144068,...,-0.300975,-0.185886,-0.369799,-0.281471,-0.232661,-0.335410,-0.443616,-0.206673,-0.172934,-0.280523
3,-0.129687,-0.486871,-0.168916,-0.632188,-0.343080,-0.147991,-0.190187,-0.438925,-0.274875,-0.144065,...,-0.252349,-0.178775,-0.372817,-0.159553,-0.213912,-0.178979,-0.421578,-0.199231,-0.168021,-0.242783
4,-0.129687,2.089718,-0.168791,0.717755,4.252767,-0.147991,-0.190200,0.314753,-0.274302,-0.144064,...,-0.293252,-0.174759,-0.180935,-0.272887,-0.179814,-0.308192,0.165941,-0.208023,-0.166212,-0.258519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21720,-0.157738,4.305586,-0.207253,2.197794,7.406568,-0.185714,2.152754,2.834862,-0.297445,-0.179866,...,0.430249,-0.184329,1.015918,-0.222168,-0.219105,-0.217155,1.783749,-0.254743,-0.212003,1.471085
21721,-0.158093,-0.328668,-0.207312,0.351229,0.727849,-0.185714,-0.247555,-0.085751,-0.350369,-0.184390,...,-0.334323,-0.227596,-0.453991,-0.334762,-0.036078,-0.296810,-0.114123,-0.266907,-0.222860,-0.307631
21722,-0.158077,0.153827,-0.207308,-0.652034,-0.424610,-0.185714,-0.247629,-0.473976,-0.338721,-0.184412,...,-0.360098,-0.221988,-0.463599,-0.282932,-0.279896,-0.253956,-0.519972,-0.266917,-0.225591,-0.328264
21723,-0.158063,-0.526058,-0.207083,0.427481,-0.418511,-0.185681,-0.247382,0.057424,-0.350206,-0.184406,...,1.801114,-0.212217,-0.115858,0.098969,-0.279897,-0.312295,-0.512031,-0.266037,-0.094657,-0.043703


In [77]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
GWAS8_abs_diff_plus_log_odds_and_cons_scores_norm = pd.concat([GWAS8_abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [78]:
col_lab = list(range(0,1384))
GWAS8_abs_diff_plus_log_odds_and_cons_scores_norm = GWAS8_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [79]:
GWAS8_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/GWAS_SNP_5E-8_dataset_XY-named.txt", sep='\t')

In [80]:
y = GWAS8_prvcs['label']

In [81]:
GWAS8_pred = test_estimator.predict(GWAS8_abs_diff_plus_log_odds_and_cons_scores_norm)

In [82]:
print("Accuracy_model_REG: %.4f" % (accuracy_score(y, GWAS8_pred)))  
print("F1_model_REG: %.4f" % (f1_score(y, GWAS8_pred)))  
print("MCC_model_REG: %.4f" % (matthews_corrcoef(y, GWAS8_pred)))  
print("ROC_AUC_model_REG: %.4f" % (roc_auc_score(y , GWAS8_pred)))
print("Avg_Recall_Score_model_REG: %.4f" % (average_precision_score(y , GWAS8_pred)))

Accuracy_model_REG: 0.5202
F1_model_REG: 0.1150
MCC_model_REG: 0.0889
ROC_AUC_model_REG: 0.5183
Avg_Recall_Score_model_REG: 0.5108


Load GWAS E-5 Test Dataset

In [83]:
GWAS5_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/GWAS5_0_60593-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [84]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/GWAS_SNP_1E-5_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(         Name  priPhCons  priPhyloP   GerpN   GerpS
 0  GWAS500001      0.005     -0.863  0.2350 -0.4700
 1  GWAS500002      0.184      0.178  0.1130  0.1130
 2  GWAS500003      0.721      0.249  1.1300 -1.5000
 3  GWAS500004      0.002     -0.707  1.2600 -2.5200
 4  GWAS500005      0.011      0.124  0.0465  0.0465,
 (60593, 5))

In [85]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [86]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.005,-0.863,0.2350,-0.4700
1,0.184,0.178,0.1130,0.1130
2,0.721,0.249,1.1300,-1.5000
3,0.002,-0.707,1.2600,-2.5200
4,0.011,0.124,0.0465,0.0465
...,...,...,...,...
60588,0.068,0.232,0.6140,0.6140
60589,0.021,-0.667,0.6370,-1.2700
60590,0.118,0.428,2.1100,-0.4250
60591,0.007,0.342,1.1700,-2.0700


In [87]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))

In [88]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,-0.448015,-1.022180,-1.088748,-0.013349
1,0.508990,0.387511,-1.165133,0.260447
2,3.380003,0.483657,-0.528379,-0.497069
3,-0.464054,-0.810929,-0.446984,-0.976094
4,-0.415937,0.314386,-1.206770,0.229216
...,...,...,...,...
60588,-0.111192,0.460636,-0.851452,0.495732
60589,-0.362473,-0.756762,-0.837051,-0.389054
60590,0.156128,0.726054,0.085210,0.007785
60591,-0.437322,0.609595,-0.503334,-0.764759


In [89]:
GWAS5_abs_diff_plus_log_odds_scores_norm

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwMcf7CtcfUniPk151-ran.1,UwNb4CtcfUniPk151-ran.1,UwNhdfneoCtcfUniPk151-ran.1,UwNhekCtcfUniPk151-ran.1,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1
0,-0.126976,-0.393417,-0.170452,-0.632455,-0.351548,-0.150167,-0.187212,-0.453192,-0.273088,-0.139205,...,-0.303147,-0.184363,-0.346480,-0.278705,-0.245373,-0.026802,-0.414323,-0.204780,-0.172762,-0.236403
1,-0.126990,-0.520931,-0.170455,-0.654701,-0.350676,-0.150167,-0.186781,-0.456035,-0.273283,-0.139208,...,-0.303420,-0.181288,-0.377618,-0.249286,-0.245114,-0.334825,-0.430574,-0.206213,-0.173918,-0.273375
2,-0.126986,-0.070269,-0.170454,0.927166,-0.350216,-0.150167,-0.187213,-0.334202,-0.272510,-0.139189,...,-0.304691,-0.177633,-0.118552,-0.179786,-0.240982,-0.242290,-0.314668,-0.205791,-0.175968,-0.275430
3,-0.126972,0.472467,-0.170417,0.047169,1.160915,-0.150167,0.003889,1.015140,5.264016,-0.139198,...,-0.271336,-0.176117,-0.365341,-0.147491,0.296357,-0.243272,-0.436960,-0.194768,-0.174534,-0.232629
4,0.187447,-0.504219,1.096573,0.861540,-0.351674,9.669863,-0.187209,-0.451860,-0.273132,-0.138323,...,6.678783,-0.060616,15.457387,10.156484,0.314958,10.032747,7.507830,7.036562,8.389212,1.659160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60588,-0.149684,0.137838,-0.191819,-0.651413,-0.427758,-0.173886,-0.243942,-0.484972,-0.333870,-0.172602,...,-0.341056,-0.207251,-0.452988,-0.268509,-0.260146,-0.245400,-0.497364,-0.241367,-0.209211,-0.312707
60589,-0.149685,0.619081,-0.191720,0.859545,1.969019,-0.173885,-0.237051,-0.438271,0.581553,-0.172599,...,-0.334840,-0.213731,-0.459995,-0.220215,-0.162399,-0.204531,-0.465757,-0.237332,-0.201387,-0.319978
60590,-0.149669,-0.552165,-0.191578,0.431885,-0.421645,-0.173849,-0.243689,0.040045,-0.346652,-0.172596,...,1.878416,-0.197004,-0.111108,0.112052,-0.260145,-0.309647,-0.489733,-0.240474,-0.077839,-0.015910
60591,-0.149715,-0.307065,-0.191806,-0.597963,-0.430148,-0.173882,-0.243784,-0.522171,-0.347194,-0.172567,...,-0.334872,-0.211631,-0.436338,-0.267527,-0.123560,-0.374008,-0.484017,-0.240110,-0.208981,-0.295278


In [90]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
GWAS5_abs_diff_plus_log_odds_and_cons_scores_norm = pd.concat([GWAS5_abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [91]:
col_lab = list(range(0,1384))
GWAS5_abs_diff_plus_log_odds_and_cons_scores_norm = GWAS5_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [92]:
GWAS5_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/GWAS_SNP_1E-5_dataset_XY-named.txt", sep='\t')

In [93]:
y = GWAS5_prvcs['label']

In [94]:
GWAS5_pred = test_estimator.predict(GWAS5_abs_diff_plus_log_odds_and_cons_scores_norm)

In [95]:
print("Accuracy_model_REG: %.4f" % (accuracy_score(y, GWAS5_pred)))  
print("F1_model_REG: %.4f" % (f1_score(y, GWAS5_pred)))  
print("MCC_model_REG: %.4f" % (matthews_corrcoef(y, GWAS5_pred)))  
print("ROC_AUC_model_REG: %.4f" % (roc_auc_score(y , GWAS5_pred)))
print("Avg_Recall_Score_model_REG: %.4f" % (average_precision_score(y , GWAS5_pred)))

Accuracy_model_REG: 0.5171
F1_model_REG: 0.1148
MCC_model_REG: 0.0730
ROC_AUC_model_REG: 0.5155
Avg_Recall_Score_model_REG: 0.5085


Load Brown eQTL dataset

In [96]:
BWN_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/BWN_0_67635-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [97]:
BWN_abs_diff_plus_log_odds_scores_norm.shape

(67635, 1380)

In [98]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/Brown_eQTL_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(       Name  priPhCons  priPhyloP  GerpN   GerpS
 0  BRN00001      0.028     -1.870   0.00  0.0000
 1  BRN00002      0.000     -0.496   1.31 -2.4600
 2  BRN00003      0.001     -0.493   2.12 -0.0176
 3  BRN00004      0.000     -0.301   2.93 -5.8500
 4  BRN00005      0.002      0.392   2.69 -1.1600,
 (67635, 5))

In [99]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [100]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.028,-1.870,0.00,0.0000
1,0.000,-0.496,1.31,-2.4600
2,0.001,-0.493,2.12,-0.0176
3,0.000,-0.301,2.93,-5.8500
4,0.002,0.392,2.69,-1.1600
...,...,...,...,...
67630,0.115,-0.033,0.00,0.0000
67631,0.020,0.144,0.00,0.0000
67632,0.206,0.185,2.53,0.6080
67633,0.035,0.121,0.00,0.0000


In [101]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))

In [102]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,-0.307058,-2.413248,-1.272719,0.246652
1,-0.476589,-0.531261,-0.417647,-0.933069
2,-0.470535,-0.527152,0.111061,0.238212
3,-0.476589,-0.264166,0.639770,-2.558783
4,-0.464480,0.685046,0.483116,-0.309639
...,...,...,...,...
67630,0.219702,0.102917,-1.272719,0.246652
67631,-0.355495,0.345356,-1.272719,0.246652
67632,0.770680,0.401515,0.378679,0.538225
67633,-0.264675,0.313853,-1.272719,0.246652


In [103]:
BWN_abs_diff_plus_log_odds_scores_norm

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwMcf7CtcfUniPk151-ran.1,UwNb4CtcfUniPk151-ran.1,UwNhdfneoCtcfUniPk151-ran.1,UwNhekCtcfUniPk151-ran.1,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1
0,-0.135348,-0.345069,-0.168924,0.863123,-0.034798,-0.037808,-0.200140,-0.418137,-0.281413,0.576187,...,0.108417,-0.147086,0.237138,-0.222720,0.020309,-0.064837,0.067443,0.089623,-0.184572,0.017552
1,-0.136769,-0.508324,-0.177759,-0.700927,-0.384701,-0.157779,-0.201242,-0.490331,-0.304740,-0.159794,...,-0.319620,-0.193400,-0.411612,-0.271661,-0.248266,-0.335923,-0.475786,-0.221497,-0.182946,-0.266036
2,0.002618,0.139306,1.125427,0.045734,0.050726,-0.157706,0.710525,1.317889,0.011336,-0.129497,...,-0.252883,-0.164831,0.620072,0.005501,-0.241039,0.271412,0.162022,5.676635,-0.102966,-0.145503
3,-0.136750,-0.556679,-0.177762,-0.705941,-0.384427,-0.157779,-0.201277,-0.491218,-0.304722,-0.159793,...,-0.317318,-0.191479,-0.405643,-0.303477,-0.258323,-0.335417,-0.480505,-0.216404,-0.181079,-0.293622
4,-0.136769,-0.554759,-0.177761,-0.708032,-0.384690,-0.157779,-0.201269,-0.490493,-0.304724,-0.159793,...,-0.292412,-0.194009,-0.403530,-0.288284,-0.188400,-0.242840,-0.478283,-0.219369,-0.179563,-0.267128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67630,0.426762,-0.519993,2.389886,-0.667262,-0.392921,1.469699,-0.242239,-0.475929,-0.333431,0.132209,...,0.545519,-0.017602,1.987220,0.871608,-0.283313,0.273685,-0.011059,0.329961,-0.199151,0.825402
67631,-0.163057,-0.387676,-0.203723,-0.704594,-0.394300,-0.183098,-0.242126,-0.481876,-0.333707,-0.177585,...,-0.347135,-0.237054,-0.413078,-0.274326,-0.264960,-0.327722,-0.501083,-0.269499,-0.227322,-0.325402
67632,-0.162968,-0.542903,-0.148704,-0.616780,-0.394467,-0.180621,-0.242148,-0.482742,-0.013730,-0.176840,...,-0.326373,-0.218202,0.329758,-0.263105,-0.185263,-0.341422,-0.444358,-0.260462,-0.229031,-0.348364
67633,-0.163057,-0.542535,-0.203727,-0.733295,-0.392377,-0.183098,-0.242255,-0.483691,-0.333663,-0.177584,...,-0.346243,-0.237949,-0.413319,-0.300008,-0.288435,-0.350085,-0.505341,-0.268147,-0.232223,-0.332598


In [104]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
BWN_abs_diff_plus_log_odds_and_cons_scores_norm = pd.concat([BWN_abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [105]:
BWN_abs_diff_plus_log_odds_and_cons_scores_norm.shape

(67635, 1384)

In [106]:
col_lab = list(range(0,1384))
BWN_abs_diff_plus_log_odds_and_cons_scores_norm = BWN_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [107]:
BWN_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/Brown_eQTL_dataset_XY-named.txt", sep='\t')

In [108]:
y = BWN_prvcs['label']

In [109]:
BWN_pred = test_estimator.predict(BWN_abs_diff_plus_log_odds_and_cons_scores_norm)

In [110]:
print("Accuracy_model_REG: %.4f" % (accuracy_score(y, BWN_pred)))  
print("F1_model_REG: %.4f" % (f1_score(y, BWN_pred)))  
print("MCC_model_REG: %.4f" % (matthews_corrcoef(y, BWN_pred)))  
print("ROC_AUC_model_REG: %.4f" % (roc_auc_score(y , BWN_pred)))
print("Avg_Recall_Score_model_REG: %.4f" % (average_precision_score(y , BWN_pred)))

Accuracy_model_REG: 0.5475
F1_model_REG: 0.1021
MCC_model_REG: 0.0542
ROC_AUC_model_REG: 0.5111
Avg_Recall_Score_model_REG: 0.4671
