## Installation

Installing Bio, transformers, genomic-benchmarks, and datasets packages.  The Bio package is from Biopython; transformers package for machine learning (pytorch, tensorflow); genomic-benchmarks and datasets from ML-Bioinfo-CEITEC.


In [12]:
# already set up on Expanse
# pip install Bio transformers genomic-benchmarks datasets transformers[torch] evaluate scikit-learn

Set kmer and model parameters

In [1]:
kmer = 6
#model_used = "armheb/DNA_bert_" + str(kmer)
model_used = "zhihan1996/DNABERT-2-117M"
train_bs = 8
eval_bs = 64
epochs = 3
warmup = 30
lr = 3e-5
save_steps = 200
eval_steps = 200
save_total_limit = 3



run_name = "run-" + "gue-test-db2-git-19"

Set output path

In [2]:
# set output path

path_prefix = "/expanse/lustre/projects/nji102/sgriesmer/"
# stdpath = path_prefix + "DNABERT/output/" + "optimization" + "/"

Run model on next 10 datasets

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import Dataset, DatasetDict, load_metric
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate
import sys
import os

# capture testing results

test_column_names = ['test_loss', 'test_accuracy', 'test_precision', 'test_recall',
                     'test_f1', 'test_matthews_correlation', 'test_runtime']
testing_results_df = pd.DataFrame(columns=test_column_names)

# initialize parameters

for fname in ["0","1","2","3","4"]:
  train_dsname = path_prefix + "DNABERT/GUE/tf/" + fname + "/" + "train.csv"
  test_dsname = path_prefix + "DNABERT/GUE/tf/" + fname + "/" + "test.csv"
  train_set = pd.read_csv(train_dsname)
  test_set = pd.read_csv(test_dsname)

# change standard error and output to saved output file

  stdpath = path_prefix + "DNABERT/output/" + fname.split(".")[0] + "/"

  try:
    os.makedirs(stdpath)
  except FileExistsError:
    pass

  stdpath_out = stdpath + "stdout.txt"
  stdpath_err = stdpath + "stderr.txt"

  sys.stdout = open(stdpath_out, 'w')
  sys.stderr = open(stdpath_err, 'w')

# reformat input

  X_train = train_set['sequence']
  y_train = train_set['label']
  X_test = test_set['sequence']
  y_test = test_set['label']

# print size of training and test sets

  print("X_train shape:", X_train.shape, file=sys.stdout)
  print("y_train shape:", y_train.shape, file=sys.stdout)
  print("X_test shape:", X_test.shape, file=sys.stdout)
  print("y_test shape:", y_test.shape, file=sys.stdout)

# load pre-trained model

  model_cls = AutoModelForSequenceClassification.from_pretrained(model_used, num_labels=2)
  params = list(model_cls.named_parameters())
  tokenizer = AutoTokenizer.from_pretrained(model_used, model_max_length=30, padding_side="right", use_fast=True)

# reformat data to Hugging Face Dataset format from pandas

  ds_Xy_train = pd.concat([y_train, X_train], axis=1)
  ds_Xy_test = pd.concat([y_test, X_test], axis=1)

  Dataset_Xy_train = Dataset.from_pandas(ds_Xy_train)
  Dataset_Xy_test = Dataset.from_pandas(ds_Xy_test)
  Dataset_Xy_train, Dataset_Xy_test

  def tokenize(batch):
    return tokenizer(batch["sequence"], return_tensors="pt", padding='longest', max_length=30, truncation=True)

  Dataset_Xy_train_tok = Dataset_Xy_train.map(tokenize, batched=True, batch_size=None)
  new_column = ["train"] * len(Dataset_Xy_train_tok)
  Dataset_Xy_train_tok = Dataset_Xy_train_tok.add_column("dset", new_column)

  Dataset_Xy_test_tok = Dataset_Xy_test.map(tokenize, batched=True, batch_size=None)
  new_column = ["test"] * len(Dataset_Xy_test_tok)
  Dataset_Xy_test_tok = Dataset_Xy_test_tok.add_column("dset", new_column)

  print(Dataset_Xy_train_tok['input_ids'][:2], file=sys.stdout)
  print(Dataset_Xy_train_tok['attention_mask'][:2], file=sys.stdout)
  
  dds = DatasetDict({
    'train': Dataset_Xy_train_tok,
    'test': Dataset_Xy_test_tok
  })

# switch to GPU

  if torch.cuda.device_count() > 0:
    model_cls.to('cuda')

# train model

  output_dir = path_prefix + 'outputs'

  args = TrainingArguments(output_dir, learning_rate=lr, warmup_steps=warmup, fp16=True,
    evaluation_strategy="steps", per_device_train_batch_size=train_bs, per_device_eval_batch_size=eval_bs, 
    eval_steps=eval_steps, save_steps=save_steps, logging_steps=100000, save_total_limit=save_total_limit,
    load_best_model_at_end=True, num_train_epochs=epochs, weight_decay=0.01, report_to='none')

  def compute_metrics(eval_preds):
    metric = evaluate.combine(["accuracy","precision","recall","f1","matthews_correlation"])
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

  trainer.train();

# save model

  fpath = path_prefix + "DNABERT/Output_Models/" + fname.split(".")[0] + "/"
  print(fpath, file=sys.stdout)
  trainer.save_model(fpath)

# test model

  eval_preds = trainer.predict(dds['test'])

  print(eval_preds, file=sys.stdout)

  tfbs_ds = fname.split(".")[0] 
  testing_results_df.loc[tfbs_ds] = [eval_preds.metrics['test_loss'],
                                  eval_preds.metrics['test_accuracy'],
                                  eval_preds.metrics['test_precision'],
                                  eval_preds.metrics['test_recall'],
                                  eval_preds.metrics['test_f1'],
                                  eval_preds.metrics['test_matthews_correlation'],
                                  eval_preds.metrics['test_runtime']]

outpath = path_prefix + "DNABERT/output/" + run_name + ".csv"
testing_results_df.to_csv(outpath)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.encoder.layer.8.intermediate.dense.bias', 'bert.encoder.layer.3.intermediate.dense.weight', 'bert.encoder.layer.5.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.1.output.dense.weight', 'bert.encoder.layer.7.output.LayerNorm.weight', 'bert.encoder.layer.9.attention.self.query.weight', 'bert.encoder.layer.2.intermediate.dense.bias', 'bert.encoder.layer.4.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.6.attention.self.query.weight', 'bert.encoder.layer.6.output.LayerNorm.bias', 'bert.encoder.layer.11.output.dense.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.10.attention.self.key.bias', 'bert.encoder.layer.2.attention.self.value.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.la

Map:   0%|          | 0/32378 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Matthews Correlation
200,No log,0.522505,0.755,0.811736,0.664,0.730473,0.518662
400,No log,0.496751,0.772,0.735294,0.85,0.788497,0.550743
600,No log,0.626322,0.748,0.824607,0.63,0.714286,0.510418
800,No log,0.510778,0.753,0.677918,0.964,0.796036,0.558132
1000,No log,0.559896,0.694,0.621859,0.99,0.763889,0.481427
1200,No log,0.515316,0.782,0.734219,0.884,0.802178,0.576115
1400,No log,0.495993,0.769,0.713154,0.9,0.795756,0.557474
1600,No log,0.514504,0.766,0.706522,0.91,0.795455,0.555538
1800,No log,0.549182,0.736,0.662534,0.962,0.784666,0.529137
2000,No log,0.509221,0.785,0.730956,0.902,0.80752,0.586277


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.encoder.layer.8.intermediate.dense.bias', 'bert.encoder.layer.3.intermediate.dense.weight', 'bert.encoder.layer.5.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.1.output.dense.weight', 'bert.encoder.layer.7.output.LayerNorm.weight', 'bert.encoder.layer.9.attention.self.query.weight', 'bert.encoder.layer.2.intermediate.dense.bias', 'bert.encoder.layer.4.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.6.attention.self.query.weight', 'bert.encoder.layer.6.output.LayerNorm.bias', 'bert.encoder.layer.11.output.dense.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.10.attention.self.key.bias', 'bert.encoder.layer.2.attention.self.value.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.la

Map:   0%|          | 0/30672 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Matthews Correlation
200,No log,0.584368,0.739,0.667133,0.954,0.785185,0.529447
400,No log,0.495194,0.771,0.701937,0.942,0.804441,0.57678
600,No log,0.477078,0.814,0.810277,0.82,0.815109,0.628045
800,No log,0.530672,0.757,0.68227,0.962,0.79834,0.563544
1000,No log,0.686841,0.494,0.0,0.0,0.0,-0.077693
1200,No log,0.597628,0.742,0.665301,0.974,0.790584,0.546377
1400,No log,0.552032,0.718,0.64418,0.974,0.775478,0.507576
1600,No log,0.49833,0.805,0.757167,0.898,0.821592,0.620834
1800,No log,0.527511,0.776,0.707831,0.94,0.80756,0.584326
2000,No log,0.543411,0.733,0.66069,0.958,0.782041,0.52182


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.encoder.layer.8.intermediate.dense.bias', 'bert.encoder.layer.3.intermediate.dense.weight', 'bert.encoder.layer.5.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.1.output.dense.weight', 'bert.encoder.layer.7.output.LayerNorm.weight', 'bert.encoder.layer.9.attention.self.query.weight', 'bert.encoder.layer.2.intermediate.dense.bias', 'bert.encoder.layer.4.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.6.attention.self.query.weight', 'bert.encoder.layer.6.output.LayerNorm.bias', 'bert.encoder.layer.11.output.dense.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.10.attention.self.key.bias', 'bert.encoder.layer.2.attention.self.value.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.la

Map:   0%|          | 0/19000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Matthews Correlation
200,No log,0.601647,0.7,0.647929,0.876,0.744898,0.42735
400,No log,0.628153,0.668,0.609091,0.938,0.738583,0.399209
600,No log,0.652305,0.689,0.631068,0.91,0.745291,0.421398
800,No log,0.568397,0.715,0.714571,0.716,0.715285,0.430001
1000,No log,0.589565,0.715,0.717172,0.71,0.713568,0.430022
1200,No log,0.565976,0.723,0.687395,0.818,0.747032,0.454275
1400,No log,0.556491,0.729,0.678072,0.872,0.762905,0.477965
1600,No log,0.611457,0.712,0.659639,0.876,0.752577,0.44883
1800,No log,0.565338,0.706,0.7575,0.606,0.673333,0.420496
2000,No log,0.572634,0.73,0.712963,0.77,0.740385,0.461479


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.encoder.layer.8.intermediate.dense.bias', 'bert.encoder.layer.3.intermediate.dense.weight', 'bert.encoder.layer.5.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.1.output.dense.weight', 'bert.encoder.layer.7.output.LayerNorm.weight', 'bert.encoder.layer.9.attention.self.query.weight', 'bert.encoder.layer.2.intermediate.dense.bias', 'bert.encoder.layer.4.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.6.attention.self.query.weight', 'bert.encoder.layer.6.output.LayerNorm.bias', 'bert.encoder.layer.11.output.dense.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.10.attention.self.key.bias', 'bert.encoder.layer.2.attention.self.value.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.la

Map:   0%|          | 0/27294 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Matthews Correlation
200,No log,0.713241,0.5,0.5,1.0,0.666667,0.0
400,No log,0.704836,0.5,0.0,0.0,0.0,0.0
600,No log,0.706603,0.5,0.0,0.0,0.0,0.0
800,No log,0.696534,0.5,0.0,0.0,0.0,0.0
1000,No log,0.703607,0.5,0.5,1.0,0.666667,0.0
1200,No log,0.696996,0.5,0.0,0.0,0.0,0.0
1400,No log,0.693869,0.5,0.0,0.0,0.0,0.0
1600,No log,0.701174,0.5,0.0,0.0,0.0,0.0
1800,No log,0.696386,0.5,0.5,1.0,0.666667,0.0
2000,No log,0.711295,0.5,0.5,1.0,0.666667,0.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.encoder.layer.8.intermediate.dense.bias', 'bert.encoder.layer.3.intermediate.dense.weight', 'bert.encoder.layer.5.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.1.output.dense.weight', 'bert.encoder.layer.7.output.LayerNorm.weight', 'bert.encoder.layer.9.attention.self.query.weight', 'bert.encoder.layer.2.intermediate.dense.bias', 'bert.encoder.layer.4.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.6.attention.self.query.weight', 'bert.encoder.layer.6.output.LayerNorm.bias', 'bert.encoder.layer.11.output.dense.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.10.attention.self.key.bias', 'bert.encoder.layer.2.attention.self.value.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.la

Map:   0%|          | 0/19000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Matthews Correlation
200,No log,0.505107,0.777,0.776447,0.778,0.777223,0.554001
400,No log,0.539216,0.746,0.677233,0.94,0.78727,0.53382
600,No log,0.523685,0.765,0.712,0.89,0.791111,0.547382
800,No log,0.579821,0.77,0.704545,0.93,0.801724,0.56997
1000,No log,0.481087,0.785,0.843373,0.7,0.765027,0.578419
1200,No log,0.541157,0.767,0.832918,0.668,0.741398,0.544786
1400,No log,0.471018,0.8,0.776753,0.842,0.808061,0.602128
1600,No log,0.468144,0.806,0.771277,0.87,0.817669,0.617076
1800,No log,0.472468,0.804,0.760274,0.888,0.819188,0.616766
2000,No log,0.44812,0.8,0.819149,0.77,0.793814,0.601083
