## Installation

Installing Bio, transformers, genomic-benchmarks, and datasets packages.  The Bio package is from Biopython; transformers package for machine learning (pytorch, tensorflow); genomic-benchmarks and datasets from ML-Bioinfo-CEITEC.


In [None]:
# already set up on Expanse; toggle for colab

# pip install -qq Bio transformers genomic-benchmarks datasets transformers[torch] pyfaidx

Set path prefix for Expanse

In [1]:
path_prefix = "/expanse/lustre/projects/nji102/sgriesmer/"

Import SNP datasets

In [2]:
import pandas as pd

abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/SNP_PRVCS_CAN_0_5023-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')
abs_diff_plus_log_odds_scores_norm.head(), abs_diff_plus_log_odds_scores_norm.shape


(   BroadDnd41CtcfUniPk151-ran  BroadDnd41Ezh239875UniPk151-ran  \
 0                    0.306653                        -0.556821   
 1                   -0.111710                        -0.113748   
 2                    0.090462                        -0.502083   
 3                   -0.143991                        -0.349744   
 4                   -0.144492                        -0.409669   
 
    BroadGm12878CtcfUniPk151-ran  BroadGm12878Ezh239875UniPk151-ran  \
 0                      0.018333                          -0.521790   
 1                      0.118644                           1.831607   
 2                     -0.059683                          -0.682626   
 3                     -0.189783                          -0.485974   
 4                     -0.191357                           0.003039   
 
    BroadH1hescChd1a301218aUniPk151-ran  BroadH1hescCtcfUniPk151-ran  \
 0                            -0.397740                     0.304906   
 1                      

Choose test set for run

In [3]:
cons_scores = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/CAN_dataset_XY-named-plus-rs-dscons.csv", sep=',')
cons_scores.head(), cons_scores.shape

(           Name  priPhCons  priPhyloP  GerpN  GerpS
 0  SNP_CAN_0001      0.018      0.457  0.893 -0.294
 1  SNP_CAN_0002      0.014      0.418  0.582  0.582
 2  SNP_CAN_0003      0.002      0.152  0.472 -0.774
 3  SNP_CAN_0004      0.046      0.299  1.760 -0.370
 4  SNP_CAN_0005      0.001     -0.353  1.910 -0.263,
 (5023, 5))

In [4]:
cons_scores

Unnamed: 0,Name,priPhCons,priPhyloP,GerpN,GerpS
0,SNP_CAN_0001,0.018,0.457,0.893,-0.294
1,SNP_CAN_0002,0.014,0.418,0.582,0.582
2,SNP_CAN_0003,0.002,0.152,0.472,-0.774
3,SNP_CAN_0004,0.046,0.299,1.760,-0.370
4,SNP_CAN_0005,0.001,-0.353,1.910,-0.263
...,...,...,...,...,...
5018,SNP_CAN_5019,0.037,0.110,0.000,0.000
5019,SNP_CAN_5020,0.004,-0.169,4.280,0.110
5020,SNP_CAN_5021,0.003,0.000,0.000,0.000
5021,SNP_CAN_5022,0.003,-2.428,3.470,-6.950


In [5]:
cons_scores = cons_scores.drop(['Name'], axis=1)

In [6]:
cons_scores

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,0.018,0.457,0.893,-0.294
1,0.014,0.418,0.582,0.582
2,0.002,0.152,0.472,-0.774
3,0.046,0.299,1.760,-0.370
4,0.001,-0.353,1.910,-0.263
...,...,...,...,...
5018,0.037,0.110,0.000,0.000
5019,0.004,-0.169,4.280,0.110
5020,0.003,0.000,0.000,0.000
5021,0.003,-2.428,3.470,-6.950


Normalize conservation scores

In [7]:
from scipy.stats import zscore

cons_scores_norm = cons_scores.apply(lambda x: zscore(x, ddof=1))



In [8]:
cons_scores_norm

Unnamed: 0,priPhCons,priPhyloP,GerpN,GerpS
0,-0.415869,0.746444,-0.830495,0.063581
1,-0.439525,0.696662,-1.035700,0.472045
2,-0.510494,0.357125,-1.108281,-0.160234
3,-0.250275,0.544764,-0.258426,0.028144
4,-0.516408,-0.287485,-0.159452,0.078036
...,...,...,...,...
5018,-0.303502,0.303514,-1.419719,0.200669
5019,-0.498666,-0.052617,1.404333,0.251960
5020,-0.504580,0.163104,-1.419719,0.200669
5021,-0.504580,-2.936131,0.869875,-3.039995


In [9]:
abs_diff_plus_log_odds_scores_norm.shape,cons_scores.shape

((5023, 1380), (5023, 4))

In [10]:
# abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores], axis=1)
abs_diff_plus_log_odds_and_cons_scores = pd.concat([abs_diff_plus_log_odds_scores_norm,cons_scores_norm], axis=1)

In [11]:
import xgboost

In [12]:
from numpy import asarray
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot

In [13]:
X=abs_diff_plus_log_odds_and_cons_scores[0:5023]

In [14]:
X

Unnamed: 0,BroadDnd41CtcfUniPk151-ran,BroadDnd41Ezh239875UniPk151-ran,BroadGm12878CtcfUniPk151-ran,BroadGm12878Ezh239875UniPk151-ran,BroadH1hescChd1a301218aUniPk151-ran,BroadH1hescCtcfUniPk151-ran,BroadH1hescEzh239875UniPk151-ran,BroadH1hescJarid1aab26049UniPk151-ran,BroadH1hescRbbp5a300109aUniPk151-ran,BroadHelas3CtcfUniPk151-ran,...,UwNhlfCtcfUniPk151-ran.1,UwRptecCtcfUniPk151-ran.1,UwSaecCtcfUniPk151-ran.1,UwSknshraCtcfUniPk151-ran.1,UwWerirb1CtcfUniPk151-ran.1,UwWi38CtcfUniPk151-ran.1,priPhCons,priPhyloP,GerpN,GerpS
0,0.306653,-0.556821,0.018333,-0.521790,-0.397740,0.304906,-0.250733,-0.502371,-0.331445,0.104589,...,0.458727,0.510215,-0.314536,0.241683,0.519838,1.156624,-0.415869,0.746444,-0.830495,0.063581
1,-0.111710,-0.113748,0.118644,1.831607,-0.396543,-0.122070,-0.250106,-0.450897,-0.332165,4.180299,...,1.378727,-0.102556,-0.500327,0.202275,0.561808,0.526119,-0.439525,0.696662,-1.035700,0.472045
2,0.090462,-0.502083,-0.059683,-0.682626,-0.397647,-0.163870,-0.250740,-0.502466,-0.332611,-0.136593,...,-0.198377,-0.124777,-0.306719,0.043669,-0.174972,-0.154073,-0.510494,0.357125,-1.108281,-0.160234
3,-0.143991,-0.349744,-0.189783,-0.485974,-0.391880,-0.164242,-0.250665,-0.300571,-0.328237,-0.139470,...,0.305962,-0.300284,0.023444,-0.226416,-0.159670,3.054452,-0.250275,0.544764,-0.258426,0.028144
4,-0.144492,-0.409669,-0.191357,0.003039,-0.373176,-0.164270,-0.250392,-0.486401,-0.285498,-0.156234,...,-0.268203,-0.142192,-0.343744,-0.228562,-0.197016,-0.247948,-0.516408,-0.287485,-0.159452,0.078036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5018,-0.138860,-0.543929,-0.073434,-0.687548,-0.397524,-0.155174,-0.250739,-0.495441,-0.332608,-0.130375,...,0.257663,0.429276,0.494833,4.729899,4.630994,-0.338688,-0.303502,0.303514,-1.419719,0.200669
5019,4.783827,1.997961,0.333336,1.850811,6.342943,2.034023,-0.247243,5.612971,-0.324075,3.024356,...,1.639358,1.499459,1.607354,2.046898,1.752810,3.238786,-0.498666,-0.052617,1.404333,0.251960
5020,-0.140555,-0.553483,-0.152443,0.561306,0.647902,0.057787,-0.250562,0.076153,-0.331908,-0.037337,...,-0.158225,0.533938,-0.514059,-0.218508,0.153822,0.420841,-0.504580,0.163104,-1.419719,0.200669
5021,-0.143869,0.203208,-0.176924,1.938559,0.238020,-0.164218,-0.167718,-0.086010,0.060280,-0.152157,...,-0.241317,-0.117355,-0.343776,-0.205415,0.074402,-0.248003,-0.504580,-2.936131,0.869875,-3.039995


In [15]:
col_lab = list(range(0,1384))
X = X.set_axis(col_lab, axis="columns")

In [16]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1374,1375,1376,1377,1378,1379,1380,1381,1382,1383
0,0.306653,-0.556821,0.018333,-0.521790,-0.397740,0.304906,-0.250733,-0.502371,-0.331445,0.104589,...,0.458727,0.510215,-0.314536,0.241683,0.519838,1.156624,-0.415869,0.746444,-0.830495,0.063581
1,-0.111710,-0.113748,0.118644,1.831607,-0.396543,-0.122070,-0.250106,-0.450897,-0.332165,4.180299,...,1.378727,-0.102556,-0.500327,0.202275,0.561808,0.526119,-0.439525,0.696662,-1.035700,0.472045
2,0.090462,-0.502083,-0.059683,-0.682626,-0.397647,-0.163870,-0.250740,-0.502466,-0.332611,-0.136593,...,-0.198377,-0.124777,-0.306719,0.043669,-0.174972,-0.154073,-0.510494,0.357125,-1.108281,-0.160234
3,-0.143991,-0.349744,-0.189783,-0.485974,-0.391880,-0.164242,-0.250665,-0.300571,-0.328237,-0.139470,...,0.305962,-0.300284,0.023444,-0.226416,-0.159670,3.054452,-0.250275,0.544764,-0.258426,0.028144
4,-0.144492,-0.409669,-0.191357,0.003039,-0.373176,-0.164270,-0.250392,-0.486401,-0.285498,-0.156234,...,-0.268203,-0.142192,-0.343744,-0.228562,-0.197016,-0.247948,-0.516408,-0.287485,-0.159452,0.078036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5018,-0.138860,-0.543929,-0.073434,-0.687548,-0.397524,-0.155174,-0.250739,-0.495441,-0.332608,-0.130375,...,0.257663,0.429276,0.494833,4.729899,4.630994,-0.338688,-0.303502,0.303514,-1.419719,0.200669
5019,4.783827,1.997961,0.333336,1.850811,6.342943,2.034023,-0.247243,5.612971,-0.324075,3.024356,...,1.639358,1.499459,1.607354,2.046898,1.752810,3.238786,-0.498666,-0.052617,1.404333,0.251960
5020,-0.140555,-0.553483,-0.152443,0.561306,0.647902,0.057787,-0.250562,0.076153,-0.331908,-0.037337,...,-0.158225,0.533938,-0.514059,-0.218508,0.153822,0.420841,-0.504580,0.163104,-1.419719,0.200669
5021,-0.143869,0.203208,-0.176924,1.938559,0.238020,-0.164218,-0.167718,-0.086010,0.060280,-0.152157,...,-0.241317,-0.117355,-0.343776,-0.205415,0.074402,-0.248003,-0.504580,-2.936131,0.869875,-3.039995


In [17]:
import pandas as pd

snp_prvcs = pd.read_csv(path_prefix + "DNABERT_2/Datasets/PRVCS/CAN_dataset_XY-named.csv", sep=',')


In [18]:
y = snp_prvcs['label'][0:5023]

In [19]:
y

0       0
1       0
2       0
3       0
4       0
       ..
5018    1
5019    1
5020    1
5021    1
5022    1
Name: label, Length: 5023, dtype: int64

In [20]:
y[2500:2510]

2500    0
2501    0
2502    0
2503    0
2504    0
2505    0
2506    1
2507    1
2508    1
2509    1
Name: label, dtype: int64

In [21]:
from sklearn.model_selection import train_test_split
test_size = 0.3
seed = 7
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [22]:
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score

model1 = XGBClassifier() 
model2 = XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1, subsample=0.5) 
train_model1 = model1.fit(X_train, y_train) 
train_model2 = model2.fit(X_train, y_train) 
pred1 = train_model1.predict(X_test) 
pred2 = train_model2.predict(X_test) 
print("Accuracy_model1: %.4f" % (accuracy_score(y_test, pred1))) 
print("Accuracy_model2: %.4f" % (accuracy_score(y_test, pred2))) 
print("F1_model1: %.4f" % (f1_score(y_test, pred1)))
print("F1_model2: %.4f" % (f1_score(y_test, pred2)))
print("MCC_model1: %.4f" % (matthews_corrcoef(y_test, pred1)))
print("MCC_model2: %.4f" % (matthews_corrcoef(y_test, pred2)))
print("ROC_AUC_model1: %.4f" % (roc_auc_score(y_test, pred1))) # don't know about call: y_score vs. pred
print("ROC_AUC_model2: %.4f" % (roc_auc_score(y_test, pred2))) # don't know about call: y_score vs. pred


Accuracy_model1: 0.7605
Accuracy_model2: 0.7472
F1_model1: 0.7532
F1_model2: 0.7389
MCC_model1: 0.5271
MCC_model2: 0.5009
ROC_AUC_model1: 0.7625
ROC_AUC_model2: 0.7493


In [23]:
pred2.sum()

678

In [24]:
model3 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=7)

train_model3 = model3.fit(X_train, y_train)
pred3 = train_model3.predict(X_test)
print("Accuracy_model3: %.4f" % (accuracy_score(y_test, pred3)))
print("F1_model3: %.4f" % (f1_score(y_test, pred3)))
print("MCC_model3: %.4f" % (matthews_corrcoef(y_test, pred3)))
print("ROC_AUC_model3: %.4f" % (roc_auc_score(y_test, pred3))) # don't know about call: y_score vs. pred

Accuracy_model3: 0.7598
F1_model3: 0.7510
MCC_model3: 0.5268
ROC_AUC_model3: 0.7621


In [25]:
from sklearn.model_selection import GridSearchCV

param_test = {
 'max_depth':[10],
 'min_child_weight':[1],
 'learning_rate':[0.1],
 'n_estimators':[1000],
 'objective':['binary:logistic']
 #'objective':['binary:logistic','binary:hinge','binary:logitraw']
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(gamma=0, 
                                               subsample=0.8, 
                                               colsample_bytree=0.8, 
                                               nthread=4, 
                                               scale_pos_weight=1, 
                                               seed=7), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=5,
                       refit=True)

train_model4 = gsearch.fit(X_train, y_train)
pred4 = train_model4.predict(X_test)
print("Accuracy_model4: %.4f" % (accuracy_score(y_test, pred4)))  
print("F1_model4: %.4f" % (f1_score(y_test, pred4)))  
print("MCC_model4: %.4f" % (matthews_corrcoef(y_test, pred4)))  
print("ROC_AUC_model4: %.4f" % (roc_auc_score(y_test, pred4)))  

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Accuracy_model4: 0.7711
F1_model4: 0.7616
MCC_model4: 0.5505
ROC_AUC_model4: 0.7736


In [26]:
pred4.sum()

666

In [27]:
gsearch.cv_results_['params'][gsearch.best_index_]

{'learning_rate': 0.1,
 'max_depth': 10,
 'min_child_weight': 1,
 'n_estimators': 1000,
 'objective': 'binary:logistic'}

In [28]:
model_deepsea1 = XGBClassifier(
 reg_alpha = 20,
 reg_lambda = 2000,
 eta = 0.1,
 n_estimators=10,
 objective= 'binary:logistic',
 seed=7)

train_model_deepsea1 = model_deepsea1.fit(X_train, y_train)
pred_deepsea1 = train_model_deepsea1.predict(X_test)
print("Accuracy_model_deepsea1: %.4f" % (accuracy_score(y_test, pred_deepsea1)))
print("F1_model_deepsea1: %.4f" % (f1_score(y_test, pred_deepsea1)))
print("MCC_model_deepsea1: %.4f" % (matthews_corrcoef(y_test, pred_deepsea1)))
print("ROC_AUC_model_deepsea1: %.4f" % (roc_auc_score(y_test, pred_deepsea1))) # don't know about call: y_score vs. pred

Accuracy_model_deepsea1: 0.7007
F1_model_deepsea1: 0.6360
MCC_model_deepsea1: 0.4523
ROC_AUC_model_deepsea1: 0.7082


In [29]:
model_deepsea2 = XGBClassifier(
 reg_alpha = 0,
 reg_lambda = 10,
 eta = 0.1,
 n_estimators=100,
 objective= 'binary:logistic',
 seed=7)

train_model_deepsea2 = model_deepsea2.fit(X_train, y_train)
pred_deepsea2 = train_model_deepsea2.predict(X_test)
print("Accuracy_model_deepsea1: %.4f" % (accuracy_score(y_test, pred_deepsea2)))
print("F1_model_deepsea2: %.4f" % (f1_score(y_test, pred_deepsea2)))
print("MCC_model_deepsea2: %.4f" % (matthews_corrcoef(y_test, pred_deepsea2)))
print("ROC_AUC_model_deepsea2: %.4f" % (roc_auc_score(y_test, pred_deepsea2))) # don't know about call: y_score vs. pred

Accuracy_model_deepsea1: 0.7664
F1_model_deepsea2: 0.7592
MCC_model_deepsea2: 0.5392
ROC_AUC_model_deepsea2: 0.7685


In [30]:
model_regbase = XGBClassifier(
    colsample_bylevel = 0.5,
    colsample_bytree = 0.8,
    gamma = 0.0,
    learning_rate = 0.1,
    max_depth = 9,
    min_child_weight = 1,
    n_estimators = 200,
    reg_alpha = 0.0,
    reg_lambda = 1,
    scale_pos_weight = 0.9956297179,
    subsample = 0.8)

train_model_regbase = model_regbase.fit(X_train, y_train)
pred_regbase = train_model_regbase.predict(X_test)
print("Accuracy_model_regbase: %.4f" % (accuracy_score(y_test, pred_regbase)))
print("F1_model_regbase: %.4f" % (f1_score(y_test, pred_deepsea2)))
print("MCC_model_regbase: %.4f" % (matthews_corrcoef(y_test, pred_regbase)))
print("ROC_AUC_model_regbase: %.4f" % (roc_auc_score(y_test, pred_regbase)))

Accuracy_model_regbase: 0.7678
F1_model_regbase: 0.7592
MCC_model_regbase: 0.5439
ROC_AUC_model_regbase: 0.7703


In [31]:
import xgboost as xgb

data_matrix = xgb.DMatrix(data=X, label=y)

params = {
    "colsample_bylevel": 0.5,
    "colsample_bytree": 0.8,
    "gamma": 0.0,
    "learning_rate": 0.1,
    "max_depth": 9,
    "min_child_weight": 1,
    "alpha": 0.0,
    "lambda": 1,
    "scale_pos_weight": 0.9956297179,
    "subsample": 0.8
}

xgb_cv = xgb.cv(dtrain=data_matrix, params=params, nfold=10,
                    num_boost_round=100, early_stopping_rounds=10, metrics="auc", as_pandas=True, seed=7)


In [32]:
xgb_cv

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.898111,0.003157,0.715711,0.024375
1,0.945889,0.004165,0.758423,0.020738
2,0.963584,0.002797,0.782267,0.020726
3,0.973136,0.003062,0.795568,0.026470
4,0.982744,0.001591,0.808185,0.026500
...,...,...,...,...
76,1.000000,0.000000,0.839461,0.021841
77,1.000000,0.000000,0.839426,0.021814
78,1.000000,0.000000,0.839389,0.021783
79,1.000000,0.000000,0.839383,0.021793


In [33]:
model_regbase.set_params(n_estimators=xgb_cv.shape[0])

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=0.5, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.0, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=9, max_leaves=None,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=81, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

In [34]:
train_model_regbase_cv = model_regbase.fit(X_train,y_train)
pred_regbase_cv = train_model_regbase.predict(X_test)
print("Accuracy_model_regbase: %.4f" % (accuracy_score(y_test, pred_regbase_cv)))
print("F1_model_regbase: %.4f" % (f1_score(y_test, pred_regbase_cv)))
print("MCC_model_regbase: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv)))
print("ROC_AUC_model_regbase: %.4f" % (roc_auc_score(y_test, pred_regbase_cv)))

Accuracy_model_regbase: 0.7571
F1_model_regbase: 0.7458
MCC_model_regbase: 0.5233
ROC_AUC_model_regbase: 0.7598


In [35]:
param_test = {
 'max_depth': range(2,10,2),
 'min_child_weight': range(2,10,2)
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(
                       colsample_bylevel = 0.5,
                       colsample_bytree = 0.8,
                       gamma = 0.0,
                       learning_rate = 0.1,
                       max_depth = 9,
                       min_child_weight = 1,
                       n_estimators = 81,
                       reg_alpha = 0.0,
                       reg_lambda = 1,
                       scale_pos_weight = 0.9956297179,
                       subsample = 0.8), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=10,
                       refit=True)

train_model_regbase_cv3 = gsearch.fit(X_train, y_train)
pred_regbase_cv3 = train_model_regbase_cv3.predict(X_test)
print("Accuracy_model_cv3: %.4f" % (accuracy_score(y_test, pred_regbase_cv3)))  
print("F1_model_cv3: %.4f" % (f1_score(y_test, pred_regbase_cv3)))  
print("MCC_model_cv3: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv3)))  
print("ROC_AUC_model_cv3: %.4f" % (roc_auc_score(y_test, pred_regbase_cv3)))  

Fitting 10 folds for each of 16 candidates, totalling 160 fits
Accuracy_model_cv3: 0.7571
F1_model_cv3: 0.7486
MCC_model_cv3: 0.5213
ROC_AUC_model_cv3: 0.7594


In [36]:
gsearch.cv_results_['params'][gsearch.best_index_]

{'max_depth': 6, 'min_child_weight': 4}

In [37]:
param_test = {
 'max_depth': [7,8,9],
 'min_child_weight': [5,6,7]
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(
                       colsample_bylevel = 0.5,
                       colsample_bytree = 0.8,
                       gamma = 0.0,
                       learning_rate = 0.1,
                       max_depth = 9,
                       min_child_weight = 1,
                       n_estimators = 81,
                       reg_alpha = 0.0,
                       reg_lambda = 1,
                       scale_pos_weight = 0.9956297179,
                       subsample = 0.8), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=10,
                       refit=True)

train_model_regbase_cv4 = gsearch.fit(X_train, y_train)
pred_regbase_cv4 = train_model_regbase_cv4.predict(X_test)
print("Accuracy_model_cv4: %.4f" % (accuracy_score(y_test, pred_regbase_cv4)))  
print("F1_model_cv4: %.4f" % (f1_score(y_test, pred_regbase_cv4)))  
print("MCC_model_cv4: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv4)))  
print("ROC_AUC_model_cv4: %.4f" % (roc_auc_score(y_test, pred_regbase_cv4)))  

Fitting 10 folds for each of 9 candidates, totalling 90 fits
Accuracy_model_cv4: 0.7558
F1_model_cv4: 0.7466
MCC_model_cv4: 0.5191
ROC_AUC_model_cv4: 0.7581


In [38]:
gsearch.cv_results_['params'][gsearch.best_index_]

{'max_depth': 7, 'min_child_weight': 6}

In [39]:
pd.DataFrame(gsearch.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,8.724663,0.086914,0.108568,0.003184,7,5,"{'max_depth': 7, 'min_child_weight': 5}",0.830331,0.839985,0.851156,...,0.99998,0.999994,0.999996,0.99998,0.999998,0.999994,0.999992,0.999997,0.999992,6.678272e-06
1,8.313191,0.091563,0.109462,0.004335,7,6,"{'max_depth': 7, 'min_child_weight': 6}",0.830524,0.838209,0.850898,...,0.999981,0.999957,0.999966,0.99997,0.99995,0.99999,0.999955,0.999986,0.999969,1.352756e-05
2,7.920521,0.102772,0.109191,0.003358,7,7,"{'max_depth': 7, 'min_child_weight': 7}",0.827296,0.838306,0.837563,...,0.999943,0.999793,0.999782,0.999815,0.999928,0.999971,0.999856,0.999933,0.999881,6.558065e-05
3,9.912339,0.106374,0.108796,0.002959,8,5,"{'max_depth': 8, 'min_child_weight': 5}",0.833075,0.836207,0.846991,...,0.999998,0.999999,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.793685e-07
4,9.407863,0.084919,0.108595,0.002777,8,6,"{'max_depth': 8, 'min_child_weight': 6}",0.832365,0.824164,0.83921,...,0.999966,0.999988,0.999997,0.999953,1.0,0.999998,0.999999,1.0,0.999989,1.536459e-05
5,8.942776,0.099596,0.109043,0.0029,8,7,"{'max_depth': 8, 'min_child_weight': 7}",0.830944,0.828393,0.849542,...,0.999976,0.999979,0.999949,0.999994,0.999995,0.999999,0.999987,0.999995,0.999976,2.142727e-05
6,10.847232,0.12127,0.109554,0.003717,9,5,"{'max_depth': 9, 'min_child_weight': 5}",0.844537,0.842019,0.845732,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.198886e-07
7,10.151652,0.077869,0.108173,0.002536,9,6,"{'max_depth': 9, 'min_child_weight': 6}",0.83314,0.831493,0.835949,...,0.999994,0.999998,0.999993,0.999997,1.0,1.0,1.0,1.0,0.999993,1.311635e-05
8,9.457695,0.28591,0.108043,0.003496,9,7,"{'max_depth': 9, 'min_child_weight': 7}",0.832623,0.837466,0.843568,...,0.999989,0.999991,0.999908,0.999994,0.999999,0.999996,1.0,1.0,0.999965,4.687656e-05


In [40]:
param_test = {
 'gamma': range(2,6),
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(
                       colsample_bylevel = 0.5,
                       colsample_bytree = 0.8,
                       gamma = 0.0,
                       learning_rate = 0.1,
                       max_depth = 9,
                       min_child_weight = 1,
                       n_estimators = 81,
                       reg_alpha = 0.0,
                       reg_lambda = 1,
                       scale_pos_weight = 0.9956297179,
                       subsample = 0.8), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=10,
                       refit=True)

train_model_regbase_cv5 = gsearch.fit(X_train, y_train)
pred_regbase_cv5 = train_model_regbase_cv5.predict(X_test)
print("Accuracy_model_cv5: %.4f" % (accuracy_score(y_test, pred_regbase_cv5)))  
print("F1_model_cv5: %.4f" % (f1_score(y_test, pred_regbase_cv5)))  
print("MCC_model_cv5: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv5)))  
print("ROC_AUC_model_cv5: %.4f" % (roc_auc_score(y_test, pred_regbase_cv5))) 

Fitting 10 folds for each of 4 candidates, totalling 40 fits
Accuracy_model_cv5: 0.7605
F1_model_cv5: 0.7505
MCC_model_cv5: 0.5291
ROC_AUC_model_cv5: 0.7629


In [41]:
gsearch.cv_results_['params'][gsearch.best_index_]

{'gamma': 3}

In [42]:
pd.DataFrame(gsearch.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,11.078319,0.055553,0.108173,0.001837,2,{'gamma': 2},0.845861,0.847088,0.84444,0.856257,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.598514e-07
1,9.166565,0.083251,0.109516,0.003091,3,{'gamma': 3},0.832397,0.838435,0.851027,0.854352,...,0.999658,0.999887,0.999917,0.999943,0.999943,0.999918,0.999947,0.999943,0.999903,8.347321e-05
2,7.867071,0.072948,0.108865,0.002015,4,{'gamma': 4},0.834657,0.846248,0.830815,0.859325,...,0.998977,0.998955,0.999021,0.9993,0.999175,0.999315,0.999264,0.999433,0.999192,0.0001593651
3,6.900939,0.069519,0.107955,0.002644,5,{'gamma': 5},0.836401,0.843213,0.851414,0.849671,...,0.996968,0.996999,0.996527,0.997376,0.997206,0.997226,0.996882,0.996747,0.996939,0.0002742372


In [43]:
param_test = {
  'subsample': [i/10.0 for i in range(6,10)],
  'colsample_bytree': [i/10.0 for i in range(6,10)]
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(
                       colsample_bylevel = 0.5,
                       colsample_bytree = 0.8,
                       gamma = 0.0,
                       learning_rate = 0.1,
                       max_depth = 9,
                       min_child_weight = 1,
                       n_estimators = 81,
                       reg_alpha = 0.0,
                       reg_lambda = 1,
                       scale_pos_weight = 0.9956297179,
                       subsample = 0.8), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=10,
                       refit=True)

train_model_regbase_cv6 = gsearch.fit(X_train, y_train)
pred_regbase_cv6 = train_model_regbase_cv6.predict(X_test)
print("Accuracy_model_cv6: %.4f" % (accuracy_score(y_test, pred_regbase_cv6)))  
print("F1_model_cv6: %.4f" % (f1_score(y_test, pred_regbase_cv6)))  
print("MCC_model_cv6: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv6)))  
print("ROC_AUC_model_cv6: %.4f" % (roc_auc_score(y_test, pred_regbase_cv6))) 

Fitting 10 folds for each of 16 candidates, totalling 160 fits
Accuracy_model_cv6: 0.7644
F1_model_cv6: 0.7533
MCC_model_cv6: 0.5382
ROC_AUC_model_cv6: 0.7671


In [44]:
gsearch.cv_results_['params'][gsearch.best_index_]

{'colsample_bytree': 0.6, 'subsample': 0.8}

In [45]:
pd.DataFrame(gsearch.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,15.015433,0.1512,0.109287,0.002905,0.6,0.6,"{'colsample_bytree': 0.6, 'subsample': 0.6}",0.825746,0.825843,0.828555,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,15.719736,0.145094,0.109907,0.002933,0.6,0.7,"{'colsample_bytree': 0.6, 'subsample': 0.7}",0.823227,0.84599,0.826036,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,16.384461,0.12652,0.109255,0.002105,0.6,0.8,"{'colsample_bytree': 0.6, 'subsample': 0.8}",0.841793,0.839823,0.862521,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,16.974409,0.139403,0.109822,0.003353,0.6,0.9,"{'colsample_bytree': 0.6, 'subsample': 0.9}",0.843536,0.840469,0.849025,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,15.614392,0.100951,0.108239,0.002316,0.7,0.6,"{'colsample_bytree': 0.7, 'subsample': 0.6}",0.819902,0.823001,0.825391,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,16.398615,0.148258,0.109346,0.003001,0.7,0.7,"{'colsample_bytree': 0.7, 'subsample': 0.7}",0.835819,0.827296,0.838499,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6,17.104644,0.175509,0.108715,0.001769,0.7,0.8,"{'colsample_bytree': 0.7, 'subsample': 0.8}",0.826618,0.843246,0.847314,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,17.77516,0.107562,0.108782,0.002683,0.7,0.9,"{'colsample_bytree': 0.7, 'subsample': 0.9}",0.844892,0.827651,0.839371,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,16.206188,0.097408,0.108721,0.00208,0.8,0.6,"{'colsample_bytree': 0.8, 'subsample': 0.6}",0.817642,0.846991,0.854966,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
9,17.030776,0.155428,0.10866,0.00252,0.8,0.7,"{'colsample_bytree': 0.8, 'subsample': 0.7}",0.832332,0.825294,0.839726,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [46]:
param_test = {
  'reg_alpha':[1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1],
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(
                       colsample_bylevel = 0.5,
                       colsample_bytree = 0.8,
                       gamma = 0.0,
                       learning_rate = 0.1,
                       max_depth = 9,
                       min_child_weight = 1,
                       n_estimators = 81,
                       reg_alpha = 0.0,
                       reg_lambda = 1,
                       scale_pos_weight = 0.9956297179,
                       subsample = 0.8), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=10,
                       refit=True)

train_model_regbase_cv7 = gsearch.fit(X_train, y_train)
pred_regbase_cv7 = train_model_regbase_cv7.predict(X_test)
print("Accuracy_model_cv7: %.4f" % (accuracy_score(y_test, pred_regbase_cv7)))  
print("F1_model_cv7: %.4f" % (f1_score(y_test, pred_regbase_cv7)))  
print("MCC_model_cv7: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv7)))  
print("ROC_AUC_model_cv7: %.4f" % (roc_auc_score(y_test, pred_regbase_cv7))) 

Fitting 10 folds for each of 6 candidates, totalling 60 fits
Accuracy_model_cv7: 0.7638
F1_model_cv7: 0.7531
MCC_model_cv7: 0.5364
ROC_AUC_model_cv7: 0.7664


In [47]:
gsearch.cv_results_['params'][gsearch.best_index_]

{'reg_alpha': 0.1}

In [48]:
pd.DataFrame(gsearch.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reg_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,17.717165,0.129438,0.109294,0.002516,1e-05,{'reg_alpha': 1e-05},0.840695,0.82933,0.847411,0.858582,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,17.777279,0.204361,0.109769,0.002763,0.0001,{'reg_alpha': 0.0001},0.837466,0.831009,0.847443,0.858582,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,17.71399,0.084833,0.110335,0.002758,0.001,{'reg_alpha': 0.001},0.840081,0.832139,0.850058,0.866073,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,17.778348,0.174706,0.110358,0.00294,0.01,{'reg_alpha': 0.01},0.833043,0.838661,0.843213,0.869172,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,17.879943,0.102986,0.109256,0.002186,0.1,{'reg_alpha': 0.1},0.852964,0.840372,0.841922,0.85968,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,19.258995,0.079135,0.108719,0.00336,1.0,{'reg_alpha': 1},0.840049,0.825552,0.834076,0.866234,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [49]:
param_test = {
  'reg_alpha':[0.0, 1e-4, 0.5e-3, 1e-3, 8e-2, 6e-2],
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(
                       colsample_bylevel = 0.5,
                       colsample_bytree = 0.8,
                       gamma = 0.0,
                       learning_rate = 0.1,
                       max_depth = 9,
                       min_child_weight = 1,
                       n_estimators = 81,
                       reg_alpha = 0.0,
                       reg_lambda = 1,
                       scale_pos_weight = 0.9956297179,
                       subsample = 0.8), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=10,
                       refit=True)

train_model_regbase_cv8 = gsearch.fit(X_train, y_train)
pred_regbase_cv8 = train_model_regbase_cv8.predict(X_test)
print("Accuracy_model_cv8: %.4f" % (accuracy_score(y_test, pred_regbase_cv8)))  
print("F1_model_cv8: %.4f" % (f1_score(y_test, pred_regbase_cv8)))  
print("MCC_model_cv8: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv8)))  
print("ROC_AUC_model_cv8: %.4f" % (roc_auc_score(y_test, pred_regbase_cv8))) 

Fitting 10 folds for each of 6 candidates, totalling 60 fits
Accuracy_model_cv8: 0.7598
F1_model_cv8: 0.7507
MCC_model_cv8: 0.5271
ROC_AUC_model_cv8: 0.7621


In [50]:
gsearch.cv_results_['params'][gsearch.best_index_]

{'reg_alpha': 0.001}

In [51]:
pd.DataFrame(gsearch.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reg_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,17.779086,0.152229,0.109295,0.002058,0.0,{'reg_alpha': 0.0},0.840695,0.82933,0.847411,0.858582,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,17.833575,0.185562,0.110641,0.002242,0.0001,{'reg_alpha': 0.0001},0.837466,0.831009,0.847443,0.858582,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,17.806952,0.110764,0.110996,0.002354,0.0005,{'reg_alpha': 0.0005},0.83427,0.832171,0.850768,0.858808,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,17.756059,0.133363,0.110756,0.001712,0.001,{'reg_alpha': 0.001},0.840081,0.832139,0.850058,0.866073,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,17.921383,0.149967,0.11014,0.001958,0.08,{'reg_alpha': 0.08},0.843342,0.8426,0.82891,0.864071,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,17.840212,0.134813,0.108866,0.003171,0.06,{'reg_alpha': 0.06},0.835141,0.839823,0.844569,0.854998,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [52]:
param_test = {
  'learning_rate':[0.01, 0.05, 0.1, 0.15, 0.2],
}
    
gsearch = GridSearchCV(estimator=XGBClassifier(
                       colsample_bylevel = 0.5,
                       colsample_bytree = 0.8,
                       gamma = 0.0,
                       learning_rate = 0.1,
                       max_depth = 9,
                       min_child_weight = 1,
                       n_estimators = 81,
                       reg_alpha = 0.0,
                       reg_lambda = 1,
                       scale_pos_weight = 0.9956297179,
                       subsample = 0.8), 
                       param_grid=param_test, 
                       scoring='roc_auc',
                       n_jobs=4, 
                       verbose=4,
                       return_train_score=True,
                       cv=10,
                       refit=True)

train_model_regbase_cv9 = gsearch.fit(X_train, y_train)
pred_regbase_cv9 = train_model_regbase_cv9.predict(X_test)
print("Accuracy_model_cv9: %.4f" % (accuracy_score(y_test, pred_regbase_cv9)))  
print("F1_model_cv9: %.4f" % (f1_score(y_test, pred_regbase_cv9)))  
print("MCC_model_cv9: %.4f" % (matthews_corrcoef(y_test, pred_regbase_cv9)))  
print("ROC_AUC_model_cv9: %.4f" % (roc_auc_score(y_test, pred_regbase_cv9))) 

Fitting 10 folds for each of 5 candidates, totalling 50 fits
Accuracy_model_cv9: 0.7578
F1_model_cv9: 0.7467
MCC_model_cv9: 0.5245
ROC_AUC_model_cv9: 0.7604


In [53]:
gsearch.cv_results_['params'][gsearch.best_index_]

{'learning_rate': 0.05}

In [54]:
pd.DataFrame(gsearch.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,22.323249,0.307227,0.110342,0.002482,0.01,{'learning_rate': 0.01},0.82849,0.832752,0.840824,0.848541,...,0.998206,0.997801,0.998428,0.998421,0.998675,0.997952,0.998565,0.998386,0.998377,0.000303
1,20.917262,0.150973,0.110218,0.001733,0.05,{'learning_rate': 0.05},0.833592,0.838144,0.839597,0.853093,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,17.746929,0.143598,0.109879,0.003299,0.1,{'learning_rate': 0.1},0.840695,0.82933,0.847411,0.858582,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,15.47015,0.085574,0.110192,0.003604,0.15,{'learning_rate': 0.15},0.843246,0.826166,0.83201,0.857968,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,13.539695,0.353067,0.108764,0.003243,0.2,{'learning_rate': 0.2},0.828781,0.843504,0.847863,0.849929,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [None]:
# Save default model

In [23]:
from joblib import dump, load

best_estimator = train_model1
dump(best_estimator, path_prefix + "DNABERT_2/Output_Models/" + "model_xpg_boost_PRVCS_CAN_CADD_cons_default.joblib")

['/expanse/lustre/projects/nji102/sgriesmer/DNABERT_2/Output_Models/model_xpg_boost_PRVCS_CAN_CADD_cons_default.joblib']

In [22]:
# Save best model

In [30]:
from joblib import dump, load

best_estimator = train_model4.best_estimator_
dump(best_estimator, path_prefix + "DNABERT_2/Output_Models/" + "model_xpg_boost_PRVCS_CAN_CADD_cons.joblib")

['/expanse/lustre/projects/nji102/sgriesmer/DNABERT_2/Output_Models/model_xpg_boost_PRVCS_CAN_CADD_cons.joblib']

In [31]:
test_estimator = load('/expanse/lustre/projects/nji102/sgriesmer/DNABERT_2/Output_Models/model_xpg_boost_PRVCS_CAN_CADD_cons.joblib')

In [25]:
test_pred = test_estimator.predict(X_test)

In [26]:
test_pred.sum()/1507

0.44459190444591906

In [27]:
# Load SNP data

In [27]:
SNPlt01_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/lt05_igap_ran_0_10000-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [28]:
SNPlt01_abs_diff_plus_log_odds_scores_norm.shape

(10000, 1380)

In [29]:
SNPlt01_abs_diff_plus_log_odds_scores_norm = SNPlt01_abs_diff_plus_log_odds_scores_norm.iloc[:,0:1380:1]

In [30]:
SNPlt01_abs_diff_plus_log_odds_scores_norm.shape

(10000, 1380)

In [31]:
# Set model names to numbers

In [32]:
SNPlt01_abs_diff_plus_log_odds_scores_norm.set_axis(col_lab, axis="columns", inplace=True)

In [33]:
SNPlt01_abs_diff_plus_log_odds_scores_norm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1370,1371,1372,1373,1374,1375,1376,1377,1378,1379
0,-0.134340,-0.176669,-0.167205,1.136512,0.598432,-0.162685,-0.172438,0.035790,-0.143567,-0.154109,...,-0.316542,-0.200087,-0.322255,-0.241135,-0.097094,-0.150122,-0.420258,-0.225923,-0.196953,-0.289094
1,-0.134376,-0.366801,-0.167250,-0.652682,-0.361398,-0.162674,-0.203901,-0.443247,-0.298716,-0.154114,...,0.419565,-0.195666,-0.377331,-0.174015,-0.222661,-0.306294,-0.472069,-0.218917,-0.197477,-0.280357
2,-0.023767,-0.197831,-0.167147,0.070671,-0.200981,0.235543,-0.203821,-0.207416,-0.223960,-0.147906,...,2.071424,-0.186225,-0.038674,0.445475,0.074265,0.001689,-0.210508,-0.193521,-0.186807,0.506437
3,-0.134375,-0.437801,-0.167141,-0.155040,-0.285044,-0.162685,-0.203818,-0.269907,-0.297292,-0.154112,...,-0.319017,-0.198726,-0.384140,-0.275544,-0.037049,-0.276440,-0.447298,-0.224284,-0.196686,-0.247948
4,-0.134354,-0.498539,-0.167261,-0.653013,-0.356434,-0.162685,-0.203906,-0.443459,-0.298768,-0.154109,...,-0.278643,-0.200142,-0.378112,-0.200160,-0.197726,-0.271060,-0.457370,-0.226998,-0.196294,-0.271508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.134333,-0.102992,-0.167079,0.438134,-0.358858,-0.162685,-0.203737,5.375807,-0.297380,-0.154109,...,-0.316421,-0.195129,-0.325719,-0.169471,-0.159812,0.021548,-0.315698,-0.225919,-0.197648,-0.276390
9996,-0.130458,-0.029596,-0.127534,1.780980,-0.002129,-0.162602,-0.203783,-0.264749,0.331581,-0.145322,...,0.118872,-0.152362,-0.317824,0.050219,0.247068,-0.077249,-0.193526,-0.225535,-0.169228,0.006952
9997,-0.134323,-0.276844,-0.167105,-0.526303,-0.351880,-0.162685,-0.198805,-0.370694,-0.266654,-0.154111,...,-0.309555,-0.190732,-0.338031,-0.288699,-0.218013,-0.282265,-0.440474,-0.219109,-0.186410,-0.270539
9998,-0.134193,0.185418,-0.165822,-0.172234,-0.355526,-0.162660,-0.203906,-0.435158,-0.297253,-0.153950,...,-0.224890,-0.168852,-0.143209,-0.260668,-0.245686,-0.070232,-0.192810,-0.107638,-0.195652,0.022638


In [34]:
SNPlt01_pred = test_estimator.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm)

In [35]:
SNPlt01_pred[0:10000].sum()

8926

In [36]:
SNPgt5_abs_diff_plus_log_odds_scores_norm = pd.read_csv(path_prefix + "DNABERT_2/output/SNPgt5_ran_10000_19997-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv", sep=',')

In [37]:
SNPgt5_abs_diff_plus_log_odds_scores_norm.set_axis(col_lab, axis="columns", inplace=True)

In [38]:
SNPgt5_abs_diff_plus_log_odds_scores_norm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1370,1371,1372,1373,1374,1375,1376,1377,1378,1379
0,-0.138665,-0.420140,-0.173807,-0.640132,-0.354017,-0.167706,-0.201053,-0.433488,-0.293240,-0.152419,...,-0.298616,-0.195598,-0.173738,-0.237389,-0.234340,-0.230862,-0.462661,-0.233144,-0.201901,-0.270089
1,-0.138644,-0.336718,-0.173833,-0.386711,-0.357093,-0.167705,-0.201084,-0.408362,-0.291127,-0.152450,...,-0.233454,-0.195493,-0.381789,-0.232173,-0.229542,-0.293351,-0.216017,-0.233435,-0.189338,-0.293258
2,-0.138619,-0.500067,-0.173816,-0.511689,-0.355782,-0.167706,-0.199971,-0.367840,0.130106,-0.152452,...,-0.303600,-0.191994,-0.375076,-0.210616,-0.206243,-0.224043,-0.065949,-0.233740,-0.190216,-0.266444
3,-0.138608,-0.497402,-0.172060,0.577827,-0.337950,-0.166874,0.137476,-0.396060,-0.293746,-0.145622,...,-0.211894,-0.143020,-0.321114,-0.246540,0.221254,-0.208902,-0.173580,-0.234957,-0.200737,-0.177275
4,-0.138651,-0.476752,-0.173588,-0.557592,-0.356994,-0.167706,-0.201089,-0.427758,-0.286041,-0.152443,...,-0.079872,-0.125941,-0.195798,-0.134981,-0.216132,-0.201555,-0.451631,-0.230652,-0.184067,-0.165696
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9992,-0.138425,-0.490933,-0.173668,-0.316845,-0.323496,-0.167704,-0.201039,-0.122288,-0.294265,-0.137659,...,-0.279311,-0.192734,-0.371469,-0.294450,-0.245204,-0.043556,0.851768,-0.233992,-0.189598,-0.273739
9993,-0.138602,2.429663,-0.173467,-0.634431,0.505936,-0.167705,-0.200044,-0.419610,-0.200065,-0.152306,...,-0.309292,-0.178383,-0.195712,-0.249831,-0.199768,-0.278973,-0.463986,-0.232627,-0.192912,-0.211255
9994,-0.137829,-0.501378,-0.173470,-0.477920,-0.293777,-0.167039,-0.201087,-0.416729,-0.262790,-0.150650,...,-0.051780,-0.195559,0.182378,0.146328,-0.019896,0.390730,-0.317636,-0.235096,-0.201179,-0.239532
9995,-0.138670,-0.493843,-0.173832,-0.638591,-0.357319,-0.167706,-0.201088,-0.437744,-0.294248,-0.152451,...,-0.314167,-0.201897,-0.379950,-0.293094,-0.253037,-0.281147,-0.460052,-0.230711,-0.196830,-0.298762


In [39]:
SNPgt5_pred = test_estimator.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm)

In [40]:
SNPgt5_pred.sum()/9997

0.854556366910073

In [41]:
# Try other models

In [42]:
pred1_lt01 = train_model1.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm) 
pred2_lt01 = train_model2.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm) 
pred3_lt01 = train_model3.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm)
pred4_lt01 = train_model4.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm)
pred_deepsea1_lt01 = train_model_deepsea1.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm)
pred_deepsea2_lt01 = train_model_deepsea2.predict(SNPlt01_abs_diff_plus_log_odds_scores_norm)

In [47]:
for i in [
    pred1_lt01,
    pred2_lt01,
    pred3_lt01,
    pred4_lt01,
    pred_deepsea1_lt01,
    pred_deepsea2_lt01,
  ]:
    print(i.sum())
    

8987
8224
8677
8926
0
9507


In [45]:
pred1_gt5 = train_model1.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm) 
pred2_gt5 = train_model2.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm) 
pred3_gt5 = train_model3.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm)
pred4_gt5 = train_model4.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm)
pred_deepsea1_gt5 = train_model_deepsea1.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm)
pred_deepsea2_gt5 = train_model_deepsea2.predict(SNPgt5_abs_diff_plus_log_odds_scores_norm)

In [46]:
for i in [
    pred1_gt5,
    pred2_gt5,
    pred3_gt5,
    pred4_gt5,
    pred_deepsea1_gt5,
    pred_deepsea2_gt5,
  ]:
    print(i.sum()/999)
    

9152
7317
8443
8543
0
9560
