## Installation

Installing Bio, transformers, genomic-benchmarks, and datasets packages.  The Bio package is from Biopython; transformers package for machine learning (pytorch, tensorflow); genomic-benchmarks and datasets from ML-Bioinfo-CEITEC.


In [None]:
# already set up on Expanse; toggle for colab

# pip install -qq Bio transformers genomic-benchmarks datasets transformers[torch] pyfaidx

Set path prefix for Expanse

In [1]:
path_prefix = "/expanse/lustre/projects/nji102/sgriesmer/"

Import SNP datasets

In [2]:
import pandas as pd

abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/SNP_PRVCS_REG_test_all-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')
abs_diff_plus_log_odds_scores_norm.head(), abs_diff_plus_log_odds_scores_norm.shape


(   BroadDnd41CtcfUniPk151-ran  BroadDnd41Ezh239875UniPk151-ran  \
 0                   -0.154390                        -0.517654   
 1                   -0.154486                        -0.526337   
 2                   -0.154486                        -0.528125   
 3                   -0.153836                        -0.513125   
 4                   -0.154373                        -0.490701   
 
    BroadGm12878CtcfUniPk151-ran  BroadGm12878Ezh239875UniPk151-ran  \
 0                     -0.198518                           1.045550   
 1                     -0.198527                           1.097025   
 2                     -0.198532                          -0.595088   
 3                     -0.157125                           0.776755   
 4                     -0.197923                          -0.629536   
 
    BroadH1hescChd1a301218aUniPk151-ran  BroadH1hescCtcfUniPk151-ran  \
 0                            -0.360801                    -0.179535   
 1                      

Import conservation scores

In [3]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/REG_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(       Name  priPhCons  priPhyloP  GerpN  GerpS
 0  SNP00001      0.000      0.000  0.000  0.000
 1  SNP00002      0.008      0.094  0.432 -0.864
 2  SNP00003      0.001     -2.101  0.523 -1.050
 3  SNP00004      0.008      0.176  0.109  0.109
 4  SNP00005      0.035      0.292  0.185  0.185,
 (61170, 5))

In [4]:
cons_scores

Unnamed: 0,Name,priPhCons,priPhyloP,GerpN,GerpS
0,SNP00001,0.000,0.000,0.000,0.000
1,SNP00002,0.008,0.094,0.432,-0.864
2,SNP00003,0.001,-2.101,0.523,-1.050
3,SNP00004,0.008,0.176,0.109,0.109
4,SNP00005,0.035,0.292,0.185,0.185
...,...,...,...,...,...
61165,SNP61166,0.050,-1.400,0.000,0.000
61166,SNP61167,0.115,-0.033,0.000,0.000
61167,SNP61168,0.035,0.121,0.000,0.000
61168,SNP61169,0.012,0.342,1.590,-1.990


Drop name column before concatenating

In [5]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [6]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.000,0.000,0.000,0.000
1,0.008,0.094,0.432,-0.864
2,0.001,-2.101,0.523,-1.050
3,0.008,0.176,0.109,0.109
4,0.035,0.292,0.185,0.185
...,...,...,...,...
61165,0.050,-1.400,0.000,0.000
61166,0.115,-0.033,0.000,0.000
61167,0.035,0.121,0.000,0.000
61168,0.012,0.342,1.590,-1.990


Normalize conservation scores

In [7]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))



In [8]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,-0.483330,0.156890,-1.045364,0.221493
1,-0.430473,0.287881,-0.756156,-0.253771
2,-0.476723,-2.770889,-0.695235,-0.356084
3,-0.430473,0.402149,-0.972393,0.281451
4,-0.252079,0.563797,-0.921513,0.323257
...,...,...,...,...
61165,-0.152972,-1.794034,-1.045364,0.221493
61166,0.276493,0.110904,-1.045364,0.221493
61167,-0.252079,0.325506,-1.045364,0.221493
61168,-0.404044,0.633473,0.019083,-0.873154


Concatenate chromatin feature scores and conservation scores

In [9]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [10]:
abs_diff_plus_log_odds_and_cons_scores

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1,priPhCons,priPhyloP,GerpN,GerpS
0,-0.154390,-0.517654,-0.198518,1.045550,-0.360801,-0.179535,-0.217770,-0.459057,-0.324826,-0.175673,...,-0.254582,-0.130768,-0.452403,-0.261299,-0.212030,-0.328566,-0.483330,0.156890,-1.045364,0.221493
1,-0.154486,-0.526337,-0.198527,1.097025,-0.372392,-0.179535,-0.219437,-0.458986,-0.324962,-0.175673,...,-0.256540,-0.332558,-0.488405,-0.259493,-0.215981,-0.319342,-0.430473,0.287881,-0.756156,-0.253771
2,-0.154486,-0.528125,-0.198532,-0.595088,-0.373646,-0.179535,-0.220829,-0.459128,-0.324945,-0.175666,...,-0.274748,-0.299737,-0.102483,-0.259153,-0.211673,-0.297017,-0.476723,-2.770889,-0.695235,-0.356084
3,-0.153836,-0.513125,-0.157125,0.776755,-0.373142,-0.178239,-0.220690,-0.450845,-0.304771,-0.175553,...,-0.214739,2.196165,-0.361218,-0.226442,1.717090,-0.153760,-0.430473,0.402149,-0.972393,0.281451
4,-0.154373,-0.490701,-0.197923,-0.629536,-0.373864,-0.179529,-0.220554,-0.459159,-0.324434,-0.175160,...,-0.028447,-0.222698,-0.152905,-0.251612,-0.201743,-0.245226,-0.252079,0.563797,-0.921513,0.323257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61165,-0.136322,-0.522225,-0.169776,-0.661247,-0.359026,-0.152563,-0.191376,-0.457834,-0.282376,-0.147173,...,-0.245971,-0.323873,-0.415494,-0.207852,-0.173248,-0.288908,-0.152972,-1.794034,-1.045364,0.221493
61166,-0.136286,-0.522024,-0.169777,-0.668193,-0.334889,-0.152524,-0.191320,-0.457706,-0.282315,-0.147162,...,-0.245290,-0.191485,-0.198568,-0.182700,-0.177811,-0.235758,0.276493,0.110904,-1.045364,0.221493
61167,-0.136322,-0.520121,-0.169780,-0.664945,-0.356867,-0.152564,-0.191359,-0.457910,-0.282280,-0.147173,...,-0.245743,-0.315652,-0.448360,-0.208909,-0.177915,-0.269982,-0.252079,0.325506,-1.045364,0.221493
61168,-0.136258,1.356259,-0.169359,2.570300,-0.100510,-0.152563,-0.178538,2.360620,-0.154137,-0.147163,...,-0.243760,-0.263651,-0.435328,-0.207928,-0.169643,-0.263580,-0.404044,0.633473,0.019083,-0.873154


In [11]:
import xgboost

In [12]:
from numpy import asarray
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot

In [13]:
X=abs_diff_plus_log_odds_and_cons_scores[0:61170]

In [14]:
X

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1,priPhCons,priPhyloP,GerpN,GerpS
0,-0.154390,-0.517654,-0.198518,1.045550,-0.360801,-0.179535,-0.217770,-0.459057,-0.324826,-0.175673,...,-0.254582,-0.130768,-0.452403,-0.261299,-0.212030,-0.328566,-0.483330,0.156890,-1.045364,0.221493
1,-0.154486,-0.526337,-0.198527,1.097025,-0.372392,-0.179535,-0.219437,-0.458986,-0.324962,-0.175673,...,-0.256540,-0.332558,-0.488405,-0.259493,-0.215981,-0.319342,-0.430473,0.287881,-0.756156,-0.253771
2,-0.154486,-0.528125,-0.198532,-0.595088,-0.373646,-0.179535,-0.220829,-0.459128,-0.324945,-0.175666,...,-0.274748,-0.299737,-0.102483,-0.259153,-0.211673,-0.297017,-0.476723,-2.770889,-0.695235,-0.356084
3,-0.153836,-0.513125,-0.157125,0.776755,-0.373142,-0.178239,-0.220690,-0.450845,-0.304771,-0.175553,...,-0.214739,2.196165,-0.361218,-0.226442,1.717090,-0.153760,-0.430473,0.402149,-0.972393,0.281451
4,-0.154373,-0.490701,-0.197923,-0.629536,-0.373864,-0.179529,-0.220554,-0.459159,-0.324434,-0.175160,...,-0.028447,-0.222698,-0.152905,-0.251612,-0.201743,-0.245226,-0.252079,0.563797,-0.921513,0.323257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61165,-0.136322,-0.522225,-0.169776,-0.661247,-0.359026,-0.152563,-0.191376,-0.457834,-0.282376,-0.147173,...,-0.245971,-0.323873,-0.415494,-0.207852,-0.173248,-0.288908,-0.152972,-1.794034,-1.045364,0.221493
61166,-0.136286,-0.522024,-0.169777,-0.668193,-0.334889,-0.152524,-0.191320,-0.457706,-0.282315,-0.147162,...,-0.245290,-0.191485,-0.198568,-0.182700,-0.177811,-0.235758,0.276493,0.110904,-1.045364,0.221493
61167,-0.136322,-0.520121,-0.169780,-0.664945,-0.356867,-0.152564,-0.191359,-0.457910,-0.282280,-0.147173,...,-0.245743,-0.315652,-0.448360,-0.208909,-0.177915,-0.269982,-0.252079,0.325506,-1.045364,0.221493
61168,-0.136258,1.356259,-0.169359,2.570300,-0.100510,-0.152563,-0.178538,2.360620,-0.154137,-0.147163,...,-0.243760,-0.263651,-0.435328,-0.207928,-0.169643,-0.263580,-0.404044,0.633473,0.019083,-0.873154


In [15]:
col_lab = list(range(0,1384))
X = X.set_axis(col_lab, axis="columns")

In [16]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1374,1375,1376,1377,1378,1379,1380,1381,1382,1383
0,-0.154390,-0.517654,-0.198518,1.045550,-0.360801,-0.179535,-0.217770,-0.459057,-0.324826,-0.175673,...,-0.254582,-0.130768,-0.452403,-0.261299,-0.212030,-0.328566,-0.483330,0.156890,-1.045364,0.221493
1,-0.154486,-0.526337,-0.198527,1.097025,-0.372392,-0.179535,-0.219437,-0.458986,-0.324962,-0.175673,...,-0.256540,-0.332558,-0.488405,-0.259493,-0.215981,-0.319342,-0.430473,0.287881,-0.756156,-0.253771
2,-0.154486,-0.528125,-0.198532,-0.595088,-0.373646,-0.179535,-0.220829,-0.459128,-0.324945,-0.175666,...,-0.274748,-0.299737,-0.102483,-0.259153,-0.211673,-0.297017,-0.476723,-2.770889,-0.695235,-0.356084
3,-0.153836,-0.513125,-0.157125,0.776755,-0.373142,-0.178239,-0.220690,-0.450845,-0.304771,-0.175553,...,-0.214739,2.196165,-0.361218,-0.226442,1.717090,-0.153760,-0.430473,0.402149,-0.972393,0.281451
4,-0.154373,-0.490701,-0.197923,-0.629536,-0.373864,-0.179529,-0.220554,-0.459159,-0.324434,-0.175160,...,-0.028447,-0.222698,-0.152905,-0.251612,-0.201743,-0.245226,-0.252079,0.563797,-0.921513,0.323257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61165,-0.136322,-0.522225,-0.169776,-0.661247,-0.359026,-0.152563,-0.191376,-0.457834,-0.282376,-0.147173,...,-0.245971,-0.323873,-0.415494,-0.207852,-0.173248,-0.288908,-0.152972,-1.794034,-1.045364,0.221493
61166,-0.136286,-0.522024,-0.169777,-0.668193,-0.334889,-0.152524,-0.191320,-0.457706,-0.282315,-0.147162,...,-0.245290,-0.191485,-0.198568,-0.182700,-0.177811,-0.235758,0.276493,0.110904,-1.045364,0.221493
61167,-0.136322,-0.520121,-0.169780,-0.664945,-0.356867,-0.152564,-0.191359,-0.457910,-0.282280,-0.147173,...,-0.245743,-0.315652,-0.448360,-0.208909,-0.177915,-0.269982,-0.252079,0.325506,-1.045364,0.221493
61168,-0.136258,1.356259,-0.169359,2.570300,-0.100510,-0.152563,-0.178538,2.360620,-0.154137,-0.147163,...,-0.243760,-0.263651,-0.435328,-0.207928,-0.169643,-0.263580,-0.404044,0.633473,0.019083,-0.873154


In [17]:
import pandas as pd

snp_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/REG_dataset_XY-named.csv", sep=',')


In [18]:
y = snp_prvcs['label'][0:61170]

In [19]:
y

0        1
1        1
2        1
3        1
4        1
        ..
61165    0
61166    0
61167    0
61168    0
61169    0
Name: label, Length: 61170, dtype: int64

In [20]:
y[5240:5250]

5240    1
5241    1
5242    1
5243    1
5244    1
5245    1
5246    1
5247    0
5248    0
5249    0
Name: label, dtype: int64

In [21]:
from sklearn.model_selection import train_test_split
test_size = 0.3
seed = 7
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [22]:
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score

model1 = XGBClassifier() 
model2 = XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1, subsample=0.5) 
train_model1 = model1.fit(X_train, y_train) 
train_model2 = model2.fit(X_train, y_train) 
pred1 = train_model1.predict(X_test) 
pred2 = train_model2.predict(X_test) 
print("Accuracy_model1: %.4f" % (accuracy_score(y_test, pred1))) 
print("Accuracy_model2: %.4f" % (accuracy_score(y_test, pred2))) 
print("F1_model1: %.4f" % (f1_score(y_test, pred1)))
print("F1_model2: %.4f" % (f1_score(y_test, pred2)))
print("MCC_model1: %.4f" % (matthews_corrcoef(y_test, pred1)))
print("MCC_model2: %.4f" % (matthews_corrcoef(y_test, pred2)))
print("ROC_AUC_model1: %.4f" % (roc_auc_score(y_test, pred1))) # don't know about call: y_score vs. pred
print("ROC_AUC_model2: %.4f" % (roc_auc_score(y_test, pred2))) # don't know about call: y_score vs. pred


Accuracy_model1: 0.9417
Accuracy_model2: 0.9416
F1_model1: 0.6586
F1_model2: 0.6610
MCC_model1: 0.6267
MCC_model2: 0.6291
ROC_AUC_model1: 0.8149
ROC_AUC_model2: 0.8189


In [23]:
pred2[0:61170].sum()

1605

In [24]:
model3 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=7)

train_model3 = model3.fit(X_train, y_train)
pred3 = train_model3.predict(X_test)
print("Accuracy_model3: %.4f" % (accuracy_score(y_test, pred3)))
print("F1_model3: %.4f" % (f1_score(y_test, pred3)))
print("MCC_model3: %.4f" % (matthews_corrcoef(y_test, pred3)))
print("ROC_AUC_model3: %.4f" % (roc_auc_score(y_test, pred3))) # don't know about call: y_score vs. pred

Accuracy_model3: 0.9440
F1_model3: 0.6765
MCC_model3: 0.6461
ROC_AUC_model3: 0.8287


In [25]:
from sklearn.model_selection import GridSearchCV

param_test = {
 'max_depth':[3],
 'min_child_weight':[2],
 'learning_rate':[0.1],
 'n_estimators':[50, 75],
 'objective':['binary:logistic']
 #'objective':['binary:logistic','binary:hinge','binary:logitraw']
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(gamma=0, 
                                               subsample=0.8, 
                                               colsample_bytree=0.8, 
                                               nthread=4, 
                                               scale_pos_weight=1, 
                                               seed=7), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=5,
                       refit=True)

train_model4 = gsearch.fit(X_train, y_train)
pred4 = train_model4.predict(X_test)
print("Accuracy_model4: %.4f" % (accuracy_score(y_test, pred4)))  
print("F1_model4: %.4f" % (f1_score(y_test, pred4)))  
print("MCC_model4: %.4f" % (matthews_corrcoef(y_test, pred4)))  
print("ROC_AUC_model4: %.4f" % (roc_auc_score(y_test, pred4)))  

Fitting 5 folds for each of 2 candidates, totalling 10 fits




Accuracy_model4: 0.9457
F1_model4: 0.6838
MCC_model4: 0.6541
ROC_AUC_model4: 0.8308


In [26]:
pred4.sum()

1596

In [27]:
gsearch.cv_results_['params'][gsearch.best_index_]

{'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 2,
 'n_estimators': 75,
 'objective': 'binary:logistic'}

In [28]:
model_deepsea1 = XGBClassifier(
 reg_alpha = 20,
 reg_lambda = 2000,
 eta = 0.1,
 n_estimators=10,
 objective= 'binary:logistic',
 seed=7)

train_model_deepsea1 = model_deepsea1.fit(X_train, y_train)
pred_deepsea1 = train_model_deepsea1.predict(X_test)
print("Accuracy_model_deepsea1: %.4f" % (accuracy_score(y_test, pred_deepsea1)))
print("F1_model_deepsea1: %.4f" % (f1_score(y_test, pred_deepsea1)))
print("MCC_model_deepsea1: %.4f" % (matthews_corrcoef(y_test, pred_deepsea1)))
print("ROC_AUC_model_deepsea1: %.4f" % (roc_auc_score(y_test, pred_deepsea1))) # don't know about call: y_score vs. pred

Accuracy_model_deepsea1: 0.9152
F1_model_deepsea1: 0.0000
MCC_model_deepsea1: 0.0000
ROC_AUC_model_deepsea1: 0.5000


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [29]:
model_deepsea2 = XGBClassifier(
 reg_alpha = 0,
 reg_lambda = 10,
 eta = 0.1,
 n_estimators=100,
 objective= 'binary:logistic',
 seed=7)

train_model_deepsea2 = model_deepsea2.fit(X_train, y_train)
pred_deepsea2 = train_model_deepsea2.predict(X_test)
print("Accuracy_model_deepsea1: %.4f" % (accuracy_score(y_test, pred_deepsea2)))
print("F1_model_deepsea2: %.4f" % (f1_score(y_test, pred_deepsea2)))
print("MCC_model_deepsea2: %.4f" % (matthews_corrcoef(y_test, pred_deepsea2)))
print("ROC_AUC_model_deepsea2: %.4f" % (roc_auc_score(y_test, pred_deepsea2))) # don't know about call: y_score vs. pred

Accuracy_model_deepsea1: 0.9429
F1_model_deepsea2: 0.6709
MCC_model_deepsea2: 0.6398
ROC_AUC_model_deepsea2: 0.8261


In [30]:
model_regbase = XGBClassifier(
    colsample_bylevel = 0.6,
    colsample_bytree = 1.0,
    gamma = 5.0,
    learning_rate = 0.1,
    max_depth = 4,
    min_child_weight = 4,
    n_estimators = 300,
    reg_alpha = 5.0,
    reg_lambda = 0.01,
    scale_pos_weight = 10.658093373,
    subsample = 1.0)

train_model_regbase = model_regbase.fit(X_train, y_train)
pred_regbase = train_model_regbase.predict(X_test)
print("Accuracy_model_regbase: %.4f" % (accuracy_score(y_test, pred_regbase)))
print("F1_model_regbase: %.4f" % (f1_score(y_test, pred_regbase)))
print("MCC_model_regbase: %.4f" % (matthews_corrcoef(y_test, pred_regbase)))
print("ROC_AUC_model_regbase: %.4f" % (roc_auc_score(y_test, pred_regbase)))

Accuracy_model_regbase: 0.9316
F1_model_regbase: 0.7067
MCC_model_regbase: 0.7049
ROC_AUC_model_regbase: 0.9498


In [31]:
pred_regbase.sum()

2725

In [32]:
import xgboost as xgb

data_matrix = xgb.DMatrix(data=X, label=y)

params = {
    "colsample_bylevel": 0.6,
    "colsample_bytree": 1.0,
    "gamma": 5.0,
    "learning_rate": 0.1,
    "max_depth": 4,
    "min_child_weight": 4,
    "alpha": 5.0,
    "lambda": 0.01,
    "scale_pos_weight": 10.658093373,
    "subsample": 1.0
}

xgb_cv = xgb.cv(dtrain=data_matrix, params=params, nfold=10,
                    num_boost_round=100, early_stopping_rounds=10, metrics="auc", as_pandas=True, seed=7)


In [33]:
xgb_cv

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.970965,0.000604,0.969913,0.002578
1,0.972889,0.000462,0.97187,0.002291
2,0.974447,0.000438,0.973205,0.002241
3,0.974987,0.00038,0.973549,0.002202
4,0.975249,0.000292,0.973789,0.002194
5,0.97551,0.000358,0.973934,0.002022
6,0.976087,0.000249,0.974543,0.001919
7,0.976266,0.000258,0.974674,0.001961
8,0.976468,0.000217,0.974797,0.001978
9,0.976693,0.000256,0.974945,0.001937


In [34]:
model_regbase.set_params(n_estimators=xgb_cv.shape[0])



XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=0.6, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=5.0, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=4, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=20, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

In [35]:
train_model_regbase_cv = model_regbase.fit(X_train,y_train)
pred_regbase_cv = train_model_regbase.predict(X_test)
print("Accuracy_model_regbase_cv: %.4f" % (accuracy_score(y_test, pred_regbase_cv)))
print("F1_model_regbase_cv: %.4f" % (f1_score(y_test, pred_regbase_cv)))
print("MCC_model_regbase_cv: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv)))
print("ROC_AUC_model_regbase_cv: %.4f" % (roc_auc_score(y_test, pred_regbase_cv)))

Accuracy_model_regbase_cv: 0.9230
F1_model_regbase_cv: 0.6879
MCC_model_regbase_cv: 0.6929
ROC_AUC_model_regbase_cv: 0.9579


In [36]:
pred_regbase_cv.sum()

2970

In [None]:
param_test = {
 'max_depth': range(2,8,2),
 'min_child_weight': range(2,8,2)
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(colsample_bylevel = 0.6,
                                                colsample_bytree = 1.0,
                                                gamma = 5.0,
                                                learning_rate = 0.1,
                                                max_depth = 4,
                                                min_child_weight = 4,
                                                n_estimators = 20,
                                                objective = 'binary:logistic',
                                                reg_alpha = 5.0,
                                                reg_lambda = 0.01,
                                                scale_pos_weight = 10.658093373,
                                                subsample = 1.0), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=5,
                       refit=True)

train_model_regbase_cv2 = gsearch.fit(X_train, y_train)
pred_regbase_cv2 = train_model_regbase_cv2.predict(X_test)
print("Accuracy_model_cv2: %.4f" % (accuracy_score(y_test, pred_regbase_cv2)))  
print("F1_model_cv2: %.4f" % (f1_score(y_test, pred_regbase_cv2)))  
print("MCC_model_cv2: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv2)))  
print("ROC_AUC_model_cv2: %.4f" % (roc_auc_score(y_test, pred_regbase_cv2)))  

In [None]:
gsearch.cv_results_['params'][gsearch.best_index_]

In [None]:
param_test = {
 'max_depth': [3,4,5],
 'min_child_weight': [3,4,5]
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(colsample_bylevel = 0.6,
                                                colsample_bytree = 1.0,
                                                gamma = 5.0,
                                                learning_rate = 0.1,
                                                max_depth = 4,
                                                min_child_weight = 4,
                                                n_estimators = 20,
                                                objective = 'binary:logistic',
                                                reg_alpha = 5.0,
                                                reg_lambda = 0.01,
                                                scale_pos_weight = 10.658093373,
                                                subsample = 1.0), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=10,
                       refit=True)

train_model_regbase_cv3 = gsearch.fit(X_train, y_train)
pred_regbase_cv3 = train_model_regbase_cv3.predict(X_test)
print("Accuracy_model_cv3: %.4f" % (accuracy_score(y_test, pred_regbase_cv3)))  
print("F1_model_cv3: %.4f" % (f1_score(y_test, pred_regbase_cv3)))  
print("MCC_model_cv3: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv3)))  
print("ROC_AUC_model_cv3: %.4f" % (roc_auc_score(y_test, pred_regbase_cv3)))  

In [None]:
gsearch.cv_results_['params'][gsearch.best_index_]

In [None]:
pd.DataFrame(gsearch.cv_results_)

In [None]:
param_test = {
 'gamma': range(2,6),
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(colsample_bylevel = 0.6,
                                                colsample_bytree = 1.0,
                                                gamma = 5.0,
                                                learning_rate = 0.1,
                                                max_depth = 4,
                                                min_child_weight = 5,
                                                n_estimators = 20,
                                                objective = 'binary:logistic',
                                                reg_alpha = 5.0,
                                                reg_lambda = 0.01,
                                                scale_pos_weight = 10.658093373,
                                                subsample = 1.0), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=10,
                       refit=True)

train_model_regbase_cv4 = gsearch.fit(X_train, y_train)
pred_regbase_cv4 = train_model_regbase_cv4.predict(X_test)
print("Accuracy_model_cv4: %.4f" % (accuracy_score(y_test, pred_regbase_cv4)))  
print("F1_model_cv4: %.4f" % (f1_score(y_test, pred_regbase_cv4)))  
print("MCC_model_cv4: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv4)))  
print("ROC_AUC_model_cv4: %.4f" % (roc_auc_score(y_test, pred_regbase_cv4)))  

In [None]:
gsearch.cv_results_['params'][gsearch.best_index_]

In [None]:
pd.DataFrame(gsearch.cv_results_)

In [None]:
param_test = {
 'subsample': [i/10.0 for i in range(6,10)],
 'colsample_bytree': [i/10.0 for i in range(6,10)]
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(colsample_bylevel = 0.6,
                                                colsample_bytree = 1.0,
                                                gamma = 3.0,
                                                learning_rate = 0.1,
                                                max_depth = 4,
                                                min_child_weight = 5,
                                                n_estimators = 20,
                                                objective = 'binary:logistic',
                                                reg_alpha = 5.0,
                                                reg_lambda = 0.01,
                                                scale_pos_weight = 10.658093373,
                                                subsample = 1.0), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=10,
                       refit=True)

train_model_regbase_cv5 = gsearch.fit(X_train, y_train)
pred_regbase_cv5 = train_model_regbase_cv5.predict(X_test)
print("Accuracy_model_cv5: %.4f" % (accuracy_score(y_test, pred_regbase_cv5)))  
print("F1_model_cv5: %.4f" % (f1_score(y_test, pred_regbase_cv5)))  
print("MCC_model_cv5: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv5)))  
print("ROC_AUC_model_cv5: %.4f" % (roc_auc_score(y_test, pred_regbase_cv5)))  

In [None]:
gsearch.cv_results_['params'][gsearch.best_index_], gsearch.best_score_

In [None]:
pd.DataFrame(gsearch.cv_results_)

In [None]:
param_test = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 5, 100]
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(colsample_bylevel = 0.6,
                                                colsample_bytree = 1.0,
                                                gamma = 3.0,
                                                learning_rate = 0.1,
                                                max_depth = 4,
                                                min_child_weight = 5,
                                                n_estimators = 24,
                                                objective = 'binary:logistic',
                                                reg_alpha = 5.0,
                                                reg_lambda = 0.01,
                                                scale_pos_weight = 10.658093373,
                                                subsample = 1.0), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=10,
                       refit=True)

train_model_regbase_cv6 = gsearch.fit(X_train, y_train)
pred_regbase_cv6 = train_model_regbase_cv6.predict(X_test)
print("Accuracy_model_cv6: %.4f" % (accuracy_score(y_test, pred_regbase_cv6)))  
print("F1_model_cv6: %.4f" % (f1_score(y_test, pred_regbase_cv6)))  
print("MCC_model_cv6: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv6)))  
print("ROC_AUC_model_cv6: %.4f" % (roc_auc_score(y_test, pred_regbase_cv6)))  

In [None]:
gsearch.cv_results_['params'][gsearch.best_index_], gsearch.best_score_

In [None]:
pd.DataFrame(gsearch.cv_results_)

In [None]:
# Save default model

In [26]:
from joblib import dump, load

best_estimator = train_model1
dump(best_estimator, path_prefix + "DNABERT_2/Output_Models/" + "model_xpg_boost_PRVCS_REG_CADD_cons_default.joblib")

['/expanse/lustre/projects/nji102/sgriesmer/DNABERT_2/Output_Models/model_xpg_boost_PRVCS_REG_CADD_cons_default.joblib']

In [None]:
# Save best model

In [None]:
from joblib import dump, load

best_estimator = train_model_regbase_cv
dump(best_estimator, path_prefix + "DNABERT_2/Output_Models/" + "model_xpg_boost_PRVCS_REG_CADD_cons.joblib")

In [37]:
from joblib import dump, load

test_estimator = load('/expanse/lustre/projects/nji102/sgriesmer/DNABERT_2/Output_Models/model_xpg_boost_PRVCS_REG_CADD_cons.joblib')

In [38]:
test_pred = test_estimator.predict(X_test)

In [39]:
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score

print("Accuracy_best: %.4f" % (accuracy_score(y_test, test_pred)))
print("F1_best: %.4f" % (f1_score(y_test, test_pred)))
print("MCC_best: %.4f" % (matthews_corrcoef(y_test, test_pred)))
print("ROC_best: %.4f" % (roc_auc_score(y_test, test_pred)))

Accuracy_best: 0.9230
F1_best: 0.6879
MCC_best: 0.6929
ROC_best: 0.9579


In [40]:
test_pred.sum()

2970

In [41]:
# Load Rate_Patho dataset

In [42]:
import pandas as pd

path_prefix = "/expanse/lustre/projects/nji102/sgriesmer/"

RARPAT_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/RARPAT_0_102-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [43]:
RARPAT_abs_diff_plus_log_odds_scores_norm.shape

(102, 1380)

Import conservation scores

In [44]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/Rare_Patho_SNV_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(         Name  priPhCons  priPhyloP  GerpN  GerpS
 0  RARPAT0001      0.001     -3.552   4.26 -6.250
 1  RARPAT0002      0.021     -1.482   4.36 -3.380
 2  RARPAT0003      0.000     -3.401   4.09 -3.280
 3  RARPAT0004      0.097      0.255   2.12  0.856
 4  RARPAT0005      0.069     -0.558   2.47 -4.950,
 (102, 5))

In [45]:
cons_scores

Unnamed: 0,Name,priPhCons,priPhyloP,GerpN,GerpS
0,RARPAT0001,0.001,-3.552,4.26,-6.2500
1,RARPAT0002,0.021,-1.482,4.36,-3.3800
2,RARPAT0003,0.000,-3.401,4.09,-3.2800
3,RARPAT0004,0.097,0.255,2.12,0.8560
4,RARPAT0005,0.069,-0.558,2.47,-4.9500
...,...,...,...,...,...
97,RARPAT0098,0.956,0.597,5.66,5.6600
98,RARPAT0099,0.916,0.597,5.14,0.0847
99,RARPAT0100,0.029,0.484,4.10,0.2040
100,RARPAT0101,0.001,-0.255,4.26,-4.4600


Drop name column before concatenating

In [46]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [47]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.001,-3.552,4.26,-6.2500
1,0.021,-1.482,4.36,-3.3800
2,0.000,-3.401,4.09,-3.2800
3,0.097,0.255,2.12,0.8560
4,0.069,-0.558,2.47,-4.9500
...,...,...,...,...
97,0.956,0.597,5.66,5.6600
98,0.916,0.597,5.14,0.0847
99,0.029,0.484,4.10,0.2040
100,0.001,-0.255,4.26,-4.4600


Normalize conservation scores

In [48]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))

In [49]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,-0.804896,-5.978133,1.105324,-3.074966
1,-0.702284,-2.482164,1.180027,-1.709069
2,-0.810026,-5.723113,0.978329,-1.661477
3,-0.312361,0.451409,-0.493317,0.306939
4,-0.456017,-0.921645,-0.231857,-2.456267
...,...,...,...,...
97,4.094798,1.029004,2.151165,2.593270
98,3.889575,1.029004,1.762710,-0.060140
99,-0.661240,0.838162,0.985800,-0.003363
100,-0.804896,-0.409916,1.105324,-2.223065


Concatenate chromatin feature scores and conservation scores

In [50]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
RARPAT_abs_diff_plus_log_odds_and_cons_scores_norm = pd.concat([RARPAT_abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [51]:
RARPAT_abs_diff_plus_log_odds_and_cons_scores_norm

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1,priPhCons,priPhyloP,GerpN,GerpS
0,-0.206538,2.906377,-0.170420,0.426654,4.482777,-0.250790,-0.309892,-0.490882,1.445626,-0.223756,...,-0.345971,-0.380637,-0.121216,-0.384348,-0.293211,-0.380950,-0.804896,-5.978133,1.105324,-3.074966
1,-0.199992,-0.375900,-0.181145,0.090514,-0.511342,-0.245346,-0.315248,-0.518109,-0.447039,-0.203248,...,0.005478,-0.285432,-0.564576,-0.375864,-0.276490,-0.429664,-0.702284,-2.482164,1.180027,-1.709069
2,-0.206664,-0.596656,0.064562,-0.834445,-0.511587,-0.200498,-0.315569,-0.146835,-0.445965,-0.220517,...,0.588175,0.127299,-0.620407,-0.369813,1.355054,0.490738,-0.810026,-5.723113,0.978329,-1.661477
3,-0.206959,-0.652848,-0.199506,-1.051992,-0.511260,-0.250757,-0.315690,-0.531651,-0.447218,-0.225224,...,-0.414731,-0.393139,-0.635052,-0.382570,-0.302146,-0.436147,-0.312361,0.451409,-0.493317,0.306939
4,-0.206414,-0.607214,-0.142333,-0.135244,-0.511261,-0.250779,-0.314809,-0.530276,-0.366058,-0.225169,...,-0.283245,-0.315174,1.199907,-0.343330,-0.283350,-0.238857,-0.456017,-0.921645,-0.231857,-2.456267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,0.358051,1.170373,0.535112,0.557339,-0.139149,-0.221604,-0.294488,-0.465753,-0.089202,-0.127893,...,-0.301338,-0.008619,-0.463245,0.107149,-0.249296,-0.181547,4.094798,1.029004,2.151165,2.593270
98,0.395013,3.306250,0.176883,0.839464,0.098059,-0.159920,-0.292982,-0.471717,0.924219,-0.163577,...,-0.416086,-0.071533,-0.495524,-0.187121,-0.251338,-0.333898,3.889575,1.029004,1.762710,-0.060140
99,1.713206,-0.532519,0.096789,0.765235,2.301548,-0.026938,-0.315642,-0.385426,-0.442458,0.458197,...,0.099295,0.810668,-0.066553,-0.376010,0.247688,0.248135,-0.661240,0.838162,0.985800,-0.003363
100,-0.206210,0.352594,-0.198994,-0.541340,-0.498336,-0.250799,1.820070,-0.310376,0.780945,-0.225134,...,-0.071771,-0.149783,-0.630602,-0.381858,-0.303848,-0.410570,-0.804896,-0.409916,1.105324,-2.223065


In [52]:
# Set model names to numbers

In [53]:
col_lab = list(range(0,1384))
RARPAT_abs_diff_plus_log_odds_and_cons_scores_norm = RARPAT_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [54]:
RARPAT_abs_diff_plus_log_odds_and_cons_scores_norm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1374,1375,1376,1377,1378,1379,1380,1381,1382,1383
0,-0.206538,2.906377,-0.170420,0.426654,4.482777,-0.250790,-0.309892,-0.490882,1.445626,-0.223756,...,-0.345971,-0.380637,-0.121216,-0.384348,-0.293211,-0.380950,-0.804896,-5.978133,1.105324,-3.074966
1,-0.199992,-0.375900,-0.181145,0.090514,-0.511342,-0.245346,-0.315248,-0.518109,-0.447039,-0.203248,...,0.005478,-0.285432,-0.564576,-0.375864,-0.276490,-0.429664,-0.702284,-2.482164,1.180027,-1.709069
2,-0.206664,-0.596656,0.064562,-0.834445,-0.511587,-0.200498,-0.315569,-0.146835,-0.445965,-0.220517,...,0.588175,0.127299,-0.620407,-0.369813,1.355054,0.490738,-0.810026,-5.723113,0.978329,-1.661477
3,-0.206959,-0.652848,-0.199506,-1.051992,-0.511260,-0.250757,-0.315690,-0.531651,-0.447218,-0.225224,...,-0.414731,-0.393139,-0.635052,-0.382570,-0.302146,-0.436147,-0.312361,0.451409,-0.493317,0.306939
4,-0.206414,-0.607214,-0.142333,-0.135244,-0.511261,-0.250779,-0.314809,-0.530276,-0.366058,-0.225169,...,-0.283245,-0.315174,1.199907,-0.343330,-0.283350,-0.238857,-0.456017,-0.921645,-0.231857,-2.456267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,0.358051,1.170373,0.535112,0.557339,-0.139149,-0.221604,-0.294488,-0.465753,-0.089202,-0.127893,...,-0.301338,-0.008619,-0.463245,0.107149,-0.249296,-0.181547,4.094798,1.029004,2.151165,2.593270
98,0.395013,3.306250,0.176883,0.839464,0.098059,-0.159920,-0.292982,-0.471717,0.924219,-0.163577,...,-0.416086,-0.071533,-0.495524,-0.187121,-0.251338,-0.333898,3.889575,1.029004,1.762710,-0.060140
99,1.713206,-0.532519,0.096789,0.765235,2.301548,-0.026938,-0.315642,-0.385426,-0.442458,0.458197,...,0.099295,0.810668,-0.066553,-0.376010,0.247688,0.248135,-0.661240,0.838162,0.985800,-0.003363
100,-0.206210,0.352594,-0.198994,-0.541340,-0.498336,-0.250799,1.820070,-0.310376,0.780945,-0.225134,...,-0.071771,-0.149783,-0.630602,-0.381858,-0.303848,-0.410570,-0.804896,-0.409916,1.105324,-2.223065


In [55]:
RARPAT_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/Rare_Patho_SNV_dataset_XY-named.txt", sep='\t')

In [56]:
y = RARPAT_prvcs['label']

In [57]:
y

0      0
1      0
2      0
3      0
4      0
      ..
97     1
98     1
99     1
100    0
101    1
Name: label, Length: 102, dtype: int64

In [58]:
RARPAT_pred = test_estimator.predict(RARPAT_abs_diff_plus_log_odds_and_cons_scores_norm)

In [59]:
print("Accuracy_model_REG: %.4f" % (accuracy_score(y, RARPAT_pred)))  
print("F1_model_REG: %.4f" % (f1_score(y, RARPAT_pred)))  
print("MCC_model_REG: %.4f" % (matthews_corrcoef(y, RARPAT_pred)))  
print("ROC_AUC_model_REG: %.4f" % (roc_auc_score(y , RARPAT_pred)))

Accuracy_model_REG: 0.4314
F1_model_REG: 0.6027
MCC_model_REG: 0.0000
ROC_AUC_model_REG: 0.5000


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Load ASD dataset

In [60]:
ASD_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/ASD_0_107-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

Import conservation scores

In [61]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/ASD_denovo_SNV_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(      Name  priPhCons  priPhyloP  GerpN  GerpS
 0  ASD0001      0.017     -0.273  3.980   3.07
 1  ASD0002      0.115     -0.033  1.909  -0.20
 2  ASD0003      0.115     -0.033  1.909  -0.20
 3  ASD0004      0.115     -0.033  1.909  -0.20
 4  ASD0005      0.115     -0.033  1.909  -0.20,
 (107, 5))

In [62]:
cons_scores

Unnamed: 0,Name,priPhCons,priPhyloP,GerpN,GerpS
0,ASD0001,0.017,-0.273,3.980,3.070
1,ASD0002,0.115,-0.033,1.909,-0.200
2,ASD0003,0.115,-0.033,1.909,-0.200
3,ASD0004,0.115,-0.033,1.909,-0.200
4,ASD0005,0.115,-0.033,1.909,-0.200
...,...,...,...,...,...
102,ASD0103,0.028,0.473,1.410,-1.150
103,ASD0104,0.115,-0.033,1.909,-0.200
104,ASD0105,0.044,-1.067,2.880,0.648
105,ASD0106,0.054,-0.467,3.360,1.160


Drop name column before concatenating

In [63]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [64]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.017,-0.273,3.980,3.070
1,0.115,-0.033,1.909,-0.200
2,0.115,-0.033,1.909,-0.200
3,0.115,-0.033,1.909,-0.200
4,0.115,-0.033,1.909,-0.200
...,...,...,...,...
102,0.028,0.473,1.410,-1.150
103,0.115,-0.033,1.909,-0.200
104,0.044,-1.067,2.880,0.648
105,0.054,-0.467,3.360,1.160


Normalize conservation scores

In [65]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))

In [66]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,-0.684033,-0.698189,1.873736,2.502629
1,-0.103806,0.033836,-0.376443,0.121347
2,-0.103806,0.033836,-0.376443,0.121347
3,-0.103806,0.033836,-0.376443,0.121347
4,-0.103806,0.033836,-0.376443,0.121347
...,...,...,...,...
102,-0.618905,1.577190,-0.918615,-0.570463
103,-0.103806,0.033836,-0.376443,0.121347
104,-0.524174,-3.119974,0.678566,0.738878
105,-0.464968,-1.289910,1.200095,1.111727


Concatenate chromatin feature scores and conservation scores

In [67]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
ASD_abs_diff_plus_log_odds_and_cons_scores_norm = pd.concat([ASD_abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [68]:
ASD_abs_diff_plus_log_odds_and_cons_scores_norm

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1,priPhCons,priPhyloP,GerpN,GerpS
0,-0.229695,-0.602858,-0.290626,-0.089052,-0.163430,-0.288711,-0.374899,-0.487998,-0.477156,-0.318228,...,-0.545594,-0.354866,-0.323475,-0.450354,-0.393204,-0.429022,-0.684033,-0.698189,1.873736,2.502629
1,-0.229542,-0.607343,-0.285806,0.696796,-0.466199,-0.288711,-0.372011,-0.445119,-0.462745,-0.319469,...,-0.535811,-0.428614,-0.507799,-0.450341,-0.392362,-0.331279,-0.103806,0.033836,-0.376443,0.121347
2,-0.229701,-0.664080,-0.290600,-0.174744,-0.453984,-0.288711,-0.370807,-0.534545,-0.476378,-0.319586,...,-0.543638,-0.424393,-0.374857,-0.436132,-0.388976,-0.426025,-0.103806,0.033836,-0.376443,0.121347
3,-0.229476,-0.584708,-0.290655,-0.789959,-0.462516,-0.288706,-0.362875,-0.532215,-0.300705,-0.318731,...,-0.475389,-0.288862,1.355761,-0.446610,-0.392134,-0.398144,-0.103806,0.033836,-0.376443,0.121347
4,-0.225621,1.027578,-0.278320,-0.599621,0.840352,-0.288705,-0.152654,-0.550221,-0.469771,-0.319039,...,-0.556688,-0.389388,-0.598057,-0.362129,-0.394222,-0.415341,-0.103806,0.033836,-0.376443,0.121347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,-0.214256,-0.687476,-0.016445,-0.741008,-0.467378,-0.073159,-0.376444,-0.554693,-0.475152,-0.310869,...,-0.394164,-0.244371,-0.620546,-0.432624,-0.283490,-0.293363,-0.618905,1.577190,-0.918615,-0.570463
103,-0.208566,-0.606097,0.049127,-0.549999,-0.467226,-0.140918,-0.224215,0.030520,-0.138759,-0.291726,...,0.079465,-0.430403,-0.487613,-0.384336,-0.120339,1.260034,-0.103806,0.033836,-0.376443,0.121347
104,-0.229132,0.378171,-0.278365,-0.783417,-0.409220,-0.283918,-0.325528,-0.119599,-0.337254,-0.318836,...,-0.359191,-0.269229,-0.161170,-0.440565,0.254996,-0.370670,-0.524174,-3.119974,0.678566,0.738878
105,-0.225772,-0.358270,-0.251520,2.095907,-0.113983,-0.288496,0.223446,0.162861,0.770044,-0.319727,...,0.025542,-0.288175,0.650125,1.904538,-0.381119,-0.345916,-0.464968,-1.289910,1.200095,1.111727


In [69]:
# Set model names to numbers

In [70]:
col_lab = list(range(0,1384))
ASD_abs_diff_plus_log_odds_and_cons_scores_norm = ASD_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [71]:
ASD_abs_diff_plus_log_odds_and_cons_scores_norm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1374,1375,1376,1377,1378,1379,1380,1381,1382,1383
0,-0.229695,-0.602858,-0.290626,-0.089052,-0.163430,-0.288711,-0.374899,-0.487998,-0.477156,-0.318228,...,-0.545594,-0.354866,-0.323475,-0.450354,-0.393204,-0.429022,-0.684033,-0.698189,1.873736,2.502629
1,-0.229542,-0.607343,-0.285806,0.696796,-0.466199,-0.288711,-0.372011,-0.445119,-0.462745,-0.319469,...,-0.535811,-0.428614,-0.507799,-0.450341,-0.392362,-0.331279,-0.103806,0.033836,-0.376443,0.121347
2,-0.229701,-0.664080,-0.290600,-0.174744,-0.453984,-0.288711,-0.370807,-0.534545,-0.476378,-0.319586,...,-0.543638,-0.424393,-0.374857,-0.436132,-0.388976,-0.426025,-0.103806,0.033836,-0.376443,0.121347
3,-0.229476,-0.584708,-0.290655,-0.789959,-0.462516,-0.288706,-0.362875,-0.532215,-0.300705,-0.318731,...,-0.475389,-0.288862,1.355761,-0.446610,-0.392134,-0.398144,-0.103806,0.033836,-0.376443,0.121347
4,-0.225621,1.027578,-0.278320,-0.599621,0.840352,-0.288705,-0.152654,-0.550221,-0.469771,-0.319039,...,-0.556688,-0.389388,-0.598057,-0.362129,-0.394222,-0.415341,-0.103806,0.033836,-0.376443,0.121347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,-0.214256,-0.687476,-0.016445,-0.741008,-0.467378,-0.073159,-0.376444,-0.554693,-0.475152,-0.310869,...,-0.394164,-0.244371,-0.620546,-0.432624,-0.283490,-0.293363,-0.618905,1.577190,-0.918615,-0.570463
103,-0.208566,-0.606097,0.049127,-0.549999,-0.467226,-0.140918,-0.224215,0.030520,-0.138759,-0.291726,...,0.079465,-0.430403,-0.487613,-0.384336,-0.120339,1.260034,-0.103806,0.033836,-0.376443,0.121347
104,-0.229132,0.378171,-0.278365,-0.783417,-0.409220,-0.283918,-0.325528,-0.119599,-0.337254,-0.318836,...,-0.359191,-0.269229,-0.161170,-0.440565,0.254996,-0.370670,-0.524174,-3.119974,0.678566,0.738878
105,-0.225772,-0.358270,-0.251520,2.095907,-0.113983,-0.288496,0.223446,0.162861,0.770044,-0.319727,...,0.025542,-0.288175,0.650125,1.904538,-0.381119,-0.345916,-0.464968,-1.289910,1.200095,1.111727


In [72]:
ASD_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/ASD_denovo_SNV_dataset_XY-named.txt", sep='\t')

In [73]:
y = ASD_prvcs['label']

In [74]:
ASD_pred = test_estimator.predict(ASD_abs_diff_plus_log_odds_and_cons_scores_norm)

In [75]:
print("Accuracy_model_REG: %.4f" % (accuracy_score(y, ASD_pred)))  
print("F1_model_REG: %.4f" % (f1_score(y, ASD_pred)))  
print("MCC_model_REG: %.4f" % (matthews_corrcoef(y, ASD_pred)))  
print("ROC_AUC_model_REG: %.4f" % (roc_auc_score(y , ASD_pred)))

Accuracy_model_REG: 0.5327
F1_model_REG: 0.6951
MCC_model_REG: 0.0000
ROC_AUC_model_REG: 0.5000


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Load GTEx dataset

In [76]:
GTEX_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/GTEX_0_796-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [77]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/GTEx_eQTL_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(       Name  priPhCons  priPhyloP  GerpN  GerpS
 0  GTEX0001      0.007      0.375  1.150  1.150
 1  GTEX0002      0.115     -0.033  1.909 -0.200
 2  GTEX0003      0.115     -0.033  1.909 -0.200
 3  GTEX0004      0.000     -0.600  1.960 -0.495
 4  GTEX0005      0.026      0.387  1.250 -1.360,
 (796, 5))

In [78]:
cons_scores

Unnamed: 0,Name,priPhCons,priPhyloP,GerpN,GerpS
0,GTEX0001,0.007,0.375,1.150,1.150
1,GTEX0002,0.115,-0.033,1.909,-0.200
2,GTEX0003,0.115,-0.033,1.909,-0.200
3,GTEX0004,0.000,-0.600,1.960,-0.495
4,GTEX0005,0.026,0.387,1.250,-1.360
...,...,...,...,...,...
791,GTEX0792,0.115,-0.033,1.909,-0.200
792,GTEX0793,0.034,0.146,0.149,0.149
793,GTEX0794,0.115,-0.033,1.909,-0.200
794,GTEX0795,0.115,-0.033,1.909,-0.200


Drop name column before concatenating

In [79]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [80]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.007,0.375,1.150,1.150
1,0.115,-0.033,1.909,-0.200
2,0.115,-0.033,1.909,-0.200
3,0.000,-0.600,1.960,-0.495
4,0.026,0.387,1.250,-1.360
...,...,...,...,...
791,0.115,-0.033,1.909,-0.200
792,0.034,0.146,0.149,0.149
793,0.115,-0.033,1.909,-0.200
794,0.115,-0.033,1.909,-0.200


Normalize conservation scores

In [81]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))

In [82]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,-0.909252,1.207975,-0.986787,1.188003
1,0.007110,0.063071,-0.059739,0.073786
2,0.007110,0.063071,-0.059739,0.073786
3,-0.968646,-1.528009,0.002553,-0.169691
4,-0.748040,1.241649,-0.864647,-0.883615
...,...,...,...,...
791,0.007110,0.063071,-0.059739,0.073786
792,-0.680161,0.565370,-2.209417,0.361832
793,0.007110,0.063071,-0.059739,0.073786
794,0.007110,0.063071,-0.059739,0.073786


Concatenate chromatin feature scores and conservation scores

In [83]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
GTEX_abs_diff_plus_log_odds_and_cons_scores_norm = pd.concat([GTEX_abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [84]:
GTEX_abs_diff_plus_log_odds_and_cons_scores_norm

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1,priPhCons,priPhyloP,GerpN,GerpS
0,-0.178355,-0.485832,-0.224357,-0.385733,-0.429759,-0.209388,-0.244968,-0.468064,-0.360461,-0.188559,...,-0.289796,-0.205919,-0.389517,-0.261726,-0.260624,-0.351443,-0.909252,1.207975,-0.986787,1.188003
1,-0.178352,-0.246669,-0.221942,0.956891,3.155623,-0.209369,-0.240326,0.142329,2.235803,-0.153395,...,-0.169110,0.110825,0.377795,-0.217207,-0.253757,-0.228107,0.007110,0.063071,-0.059739,0.073786
2,-0.178302,-0.517851,-0.224342,-0.729727,-0.162195,-0.209389,-0.243257,-0.470614,-0.358797,-0.188556,...,-0.265715,-0.264661,-0.455395,-0.263826,-0.255614,-0.323819,0.007110,0.063071,-0.059739,0.073786
3,-0.178280,-0.375803,-0.223981,-0.385835,0.343213,-0.209360,-0.244811,-0.464363,-0.129742,-0.187047,...,-0.161633,-0.151567,0.832964,-0.253708,-0.236141,-0.281717,-0.968646,-1.528009,0.002553,-0.169691
4,-0.178030,0.393779,-0.223855,-0.137609,1.212573,-0.209318,4.905893,2.569003,1.323787,-0.184953,...,-0.285058,-0.317789,-0.466421,-0.265439,-0.254904,-0.169747,-0.748040,1.241649,-0.864647,-0.883615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,0.198956,-0.526851,0.979379,-0.594531,-0.429889,-0.201291,-0.244975,-0.471995,-0.360122,-0.184072,...,0.367565,0.833365,-0.147515,3.042131,0.044801,-0.188965,0.007110,0.063071,-0.059739,0.073786
792,0.018531,-0.162753,0.481206,0.670652,-0.429505,-0.190066,-0.237247,-0.270271,0.260414,-0.146905,...,-0.092712,-0.073844,-0.302482,-0.021024,-0.050756,-0.149935,-0.680161,0.565370,-2.209417,0.361832
793,-0.178362,-0.528672,-0.224362,-0.726840,-0.429889,-0.209386,-0.244985,-0.470701,-0.360351,-0.188561,...,-0.277991,-0.332510,-0.463221,-0.264702,-0.256798,-0.348923,0.007110,0.063071,-0.059739,0.073786
794,0.076135,-0.512977,0.433604,-0.214048,-0.429695,-0.193048,-0.241100,0.658683,-0.340889,-0.169407,...,1.220882,0.182769,-0.101256,-0.004808,-0.217325,-0.007000,0.007110,0.063071,-0.059739,0.073786


In [85]:
# Set model names to numbers

In [86]:
col_lab = list(range(0,1384))
GTEX_abs_diff_plus_log_odds_and_cons_scores_norm = GTEX_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [87]:
GTEX_abs_diff_plus_log_odds_and_cons_scores_norm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1374,1375,1376,1377,1378,1379,1380,1381,1382,1383
0,-0.178355,-0.485832,-0.224357,-0.385733,-0.429759,-0.209388,-0.244968,-0.468064,-0.360461,-0.188559,...,-0.289796,-0.205919,-0.389517,-0.261726,-0.260624,-0.351443,-0.909252,1.207975,-0.986787,1.188003
1,-0.178352,-0.246669,-0.221942,0.956891,3.155623,-0.209369,-0.240326,0.142329,2.235803,-0.153395,...,-0.169110,0.110825,0.377795,-0.217207,-0.253757,-0.228107,0.007110,0.063071,-0.059739,0.073786
2,-0.178302,-0.517851,-0.224342,-0.729727,-0.162195,-0.209389,-0.243257,-0.470614,-0.358797,-0.188556,...,-0.265715,-0.264661,-0.455395,-0.263826,-0.255614,-0.323819,0.007110,0.063071,-0.059739,0.073786
3,-0.178280,-0.375803,-0.223981,-0.385835,0.343213,-0.209360,-0.244811,-0.464363,-0.129742,-0.187047,...,-0.161633,-0.151567,0.832964,-0.253708,-0.236141,-0.281717,-0.968646,-1.528009,0.002553,-0.169691
4,-0.178030,0.393779,-0.223855,-0.137609,1.212573,-0.209318,4.905893,2.569003,1.323787,-0.184953,...,-0.285058,-0.317789,-0.466421,-0.265439,-0.254904,-0.169747,-0.748040,1.241649,-0.864647,-0.883615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,0.198956,-0.526851,0.979379,-0.594531,-0.429889,-0.201291,-0.244975,-0.471995,-0.360122,-0.184072,...,0.367565,0.833365,-0.147515,3.042131,0.044801,-0.188965,0.007110,0.063071,-0.059739,0.073786
792,0.018531,-0.162753,0.481206,0.670652,-0.429505,-0.190066,-0.237247,-0.270271,0.260414,-0.146905,...,-0.092712,-0.073844,-0.302482,-0.021024,-0.050756,-0.149935,-0.680161,0.565370,-2.209417,0.361832
793,-0.178362,-0.528672,-0.224362,-0.726840,-0.429889,-0.209386,-0.244985,-0.470701,-0.360351,-0.188561,...,-0.277991,-0.332510,-0.463221,-0.264702,-0.256798,-0.348923,0.007110,0.063071,-0.059739,0.073786
794,0.076135,-0.512977,0.433604,-0.214048,-0.429695,-0.193048,-0.241100,0.658683,-0.340889,-0.169407,...,1.220882,0.182769,-0.101256,-0.004808,-0.217325,-0.007000,0.007110,0.063071,-0.059739,0.073786


In [88]:
GTEX_abs_diff_plus_log_odds_and_cons_scores_norm = GTEX_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [89]:
GTEX_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/GTEx_eQTL_dataset_XY-named.txt", sep='\t')

In [90]:
y = GTEX_prvcs['label']

In [91]:
GTEX_pred = test_estimator.predict(GTEX_abs_diff_plus_log_odds_and_cons_scores_norm)

In [92]:
print("Accuracy_model_REG: %.4f" % (accuracy_score(y, GTEX_pred)))  
print("F1_model_REG: %.4f" % (f1_score(y, GTEX_pred)))  
print("MCC_model_REG: %.4f" % (matthews_corrcoef(y, GTEX_pred)))  
print("ROC_AUC_model_REG: %.4f" % (roc_auc_score(y , GTEX_pred)))

Accuracy_model_REG: 0.5101
F1_model_REG: 0.6750
MCC_model_REG: 0.0361
ROC_AUC_model_REG: 0.5013


Load Somatic eQTL dataset

In [106]:
SOMEQTL_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/SOMEQTL_0_7513-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [107]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/Somatic_eQTL_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(           Name  priPhCons  priPhyloP  GerpN  GerpS
 0  SOMEQTL00001      0.236      0.094   0.00  0.000
 1  SOMEQTL00002      0.006     -0.398   3.13 -1.420
 2  SOMEQTL00003      0.008     -0.821   4.49 -3.810
 3  SOMEQTL00004      0.008     -0.146   3.32  1.450
 4  SOMEQTL00005      0.010     -0.357   2.51  0.611,
 (7513, 5))

In [108]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [109]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.236,0.094,0.00,0.000
1,0.006,-0.398,3.13,-1.420
2,0.008,-0.821,4.49,-3.810
3,0.008,-0.146,3.32,1.450
4,0.010,-0.357,2.51,0.611
...,...,...,...,...
7508,0.266,0.124,0.81,-0.189
7509,0.505,0.487,3.68,1.190
7510,0.014,-0.243,3.68,-1.990
7511,0.007,-0.483,2.97,0.243


In [110]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))

In [111]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,1.213221,0.183020,-2.072384,0.268571
1,-0.453869,-0.510635,0.063053,-0.252106
2,-0.439372,-1.107009,0.990910,-1.128457
3,-0.439372,-0.155348,0.192680,0.800248
4,-0.424876,-0.452830,-0.359941,0.492609
...,...,...,...,...
7508,1.430667,0.225316,-1.519763,0.199270
7509,3.162991,0.737098,0.438289,0.704913
7510,-0.395883,-0.292105,0.438289,-0.461110
7511,-0.446620,-0.630474,-0.046107,0.357673


In [112]:
SOMEQTL_abs_diff_plus_log_odds_scores_norm

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwMcf7CtcfUniPk151-ran.1,UwNb4CtcfUniPk151-ran.1,UwNhdfneoCtcfUniPk151-ran.1,UwNhekCtcfUniPk151-ran.1,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1
0,-0.144903,0.195737,-0.182222,-0.108920,0.215997,-0.163565,-0.275462,-0.144383,-0.338155,-0.164474,...,-0.310175,-0.211341,-0.344102,-0.265758,-0.293365,-0.196343,-0.520804,-0.231279,-0.203031,-0.333002
1,-0.144924,-0.568833,-0.182462,-0.683622,-0.434228,-0.163565,-0.275869,-0.512477,-0.357202,-0.164471,...,-0.342034,-0.200050,-0.315634,-0.105513,-0.296193,-0.337912,-0.506100,-0.237790,-0.215480,-0.324308
2,-0.144511,-0.200158,-0.181925,-0.358322,-0.394926,-0.163529,-0.275796,-0.468553,-0.354824,-0.158311,...,0.963121,-0.209746,-0.429988,-0.135167,-0.273474,0.019215,-0.320742,0.107533,-0.203012,-0.307684
3,-0.144869,-0.526162,-0.180201,1.135465,0.723396,-0.163565,-0.237328,0.449306,-0.355735,-0.163939,...,-0.307590,-0.185465,-0.407711,-0.259176,-0.181607,-0.258805,-0.168069,-0.238298,-0.211773,-0.291270
4,-0.144908,-0.247975,-0.141101,-0.480016,-0.216463,-0.150550,-0.260551,0.054881,-0.087934,-0.103180,...,0.738351,0.587462,-0.143264,1.144424,0.048478,-0.293651,-0.452174,-0.230422,0.445050,0.516132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7508,-0.144305,-0.367277,-0.182471,-0.244942,-0.415514,-0.163563,-0.243906,-0.393012,-0.113778,-0.149147,...,-0.336289,-0.215802,-0.280409,-0.196786,-0.278953,-0.347581,0.610747,-0.238620,-0.216403,-0.295181
7509,-0.144923,-0.547270,-0.182243,1.006641,-0.432987,-0.163565,-0.275755,1.251890,-0.357355,-0.164476,...,-0.336786,-0.217042,-0.349688,-0.267488,-0.205523,-0.333389,-0.438623,-0.235375,-0.209342,-0.246323
7510,-0.144899,-0.591999,-0.182477,-0.479671,-0.434303,-0.163564,-0.275865,-0.517299,-0.357452,-0.164468,...,-0.328957,-0.143269,-0.103965,-0.264673,-0.296925,-0.072538,-0.204852,-0.234325,-0.212516,-0.237920
7511,-0.143923,0.127016,-0.182166,0.305013,-0.262310,-0.163462,-0.275867,-0.508464,-0.357306,-0.164449,...,-0.300933,-0.219915,-0.351700,0.076195,-0.290894,0.259330,-0.246323,-0.233290,-0.214649,-0.301657


In [113]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
SOMEQTL_abs_diff_plus_log_odds_and_cons_scores_norm = pd.concat([SOMEQTL_abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [115]:
col_lab = list(range(0,1384))
SOMEQTL_abs_diff_plus_log_odds_and_cons_scores_norm = SOMEQTL_abs_diff_plus_log_odds_and_cons_scores_norm.set_axis(col_lab, axis="columns")

In [116]:
SOMEQTL_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/test_dataset/Somatic_eQTL_dataset_XY-named.txt", sep='\t')

In [117]:
y = SOMEQTL_prvcs['label']

In [119]:
SOMEQTL_pred = test_estimator.predict(SOMEQTL_abs_diff_plus_log_odds_and_cons_scores_norm)

In [120]:
print("Accuracy_model_REG: %.4f" % (accuracy_score(y, SOMEQTL_pred)))  
print("F1_model_REG: %.4f" % (f1_score(y, SOMEQTL_pred)))  
print("MCC_model_REG: %.4f" % (matthews_corrcoef(y, SOMEQTL_pred)))  
print("ROC_AUC_model_REG: %.4f" % (roc_auc_score(y , SOMEQTL_pred)))

Accuracy_model_REG: 0.5095
F1_model_REG: 0.6535
MCC_model_REG: 0.0344
ROC_AUC_model_REG: 0.5096


In [None]:
test_estimator = load('/expanse/lustre/projects/nji102/sgriesmer/DNABERT_2/Output_Models/model_xpg_boost_PRVCS_REG_CADD_cons.joblib')

In [None]:
test_pred = test_estimator.predict(X_test)

In [None]:
test_pred.sum()/3148

In [None]:
# Load SNP data

In [None]:
SNPlt01_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/lt05_igap_ran_0_10000-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [None]:
# Set model names to numbers

In [None]:
SNPlt01_abs_diff_plus_log_odds_scores_norm.set_axis(col_lab, axis="columns", inplace=True)

In [None]:
SNPlt01_abs_diff_plus_log_odds_scores_norm

In [None]:
SNPlt01_pred = test_estimator.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm)

In [None]:
SNPlt01_pred.sum()

In [None]:
SNPgt5_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/gt5_igap_ran_0_10000-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [None]:
SNPgt5_abs_diff_plus_log_odds_scores_norm.set_axis(col_lab, axis="columns", inplace=True)

In [None]:
SNPgt5_abs_diff_plus_log_odds_scores_norm

In [None]:
SNPgt5_pred = test_estimator.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm)

In [None]:
SNPgt5_pred.sum()

In [None]:
# Try other models

In [None]:
pred1_lt01 = train_model1.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm) 
pred2_lt01 = train_model2.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm) 
pred3_lt01 = train_model3.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm)
pred4_lt01 = train_model4.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm)
pred_deepsea1_lt01 = train_model_deepsea1.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm)
pred_deepsea2_lt01 = train_model_deepsea2.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm)

In [None]:
for i in [
    pred1_lt01,
    pred2_lt01,
    pred3_lt01,
    pred4_lt01,
    pred_deepsea1_lt01,
    pred_deepsea2_lt01,
  ]:
    print(i.sum())

In [None]:
pred1_gt5 = train_model1.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm) 
pred2_gt5 = train_model2.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm) 
pred3_gt5 = train_model3.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm)
pred4_gt5 = train_model4.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm)
pred_deepsea1_gt5 = train_model_deepsea1.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm)
pred_deepsea2_gt5 = train_model_deepsea2.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm)

In [None]:
for i in [
    pred1_gt5,
    pred2_gt5,
    pred3_gt5,
    pred4_gt5,
    pred_deepsea1_gt5,
    pred_deepsea2_gt5,
  ]:
    print(i.sum())