## Installation

Installing Bio, transformers, genomic-benchmarks, and datasets packages.  The Bio package is from Biopython; transformers package for machine learning (pytorch, tensorflow); genomic-benchmarks and datasets from ML-Bioinfo-CEITEC.


In [1]:
#pip install transformers==4.29.2 scikit-learn einops evaluate accelerate optuna


In [2]:
# uninstall triton in shell

# pip uninstall --yes triton

Set kmer and model parameters

In [1]:
kmer = 6
#model_used = "armheb/DNA_bert_" + str(kmer)
model_used = "zhihan1996/DNABERT-2-117M"
train_bs = 32
eval_bs = 32
epochs = 3
warmup = 104
lr = 3.8e-5
save_steps = 200
eval_steps = 200
save_total_limit = 3

run_name = "run-" + "tfbs-datasets-570"

Set output path

In [2]:
# set output path

path_prefix = "/expanse/lustre/projects/nji102/sgriesmer/"
# stdpath = path_prefix + "DNABERT/output/" + "optimization" + "/"

Run model on next 10 datasets

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import Dataset, DatasetDict, load_metric
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate
import optuna
import sys
import os

# set up dataframe to capture testing results

test_column_names = ['test_loss', 'test_accuracy', 'test_precision', 'test_recall',
                     'test_f1', 'test_matthews_correlation', 'test_runtime']
testing_results_df = pd.DataFrame(columns=test_column_names)

# initialize parameters

for fname in [
  "SydhMcf10aesStat3Tam112hHvdUniPk151-ran.csv"
]:

  dsname = path_prefix + "DNABERT_2/Datasets/tfbs/" + fname
  tfbs_dataset = pd.read_csv(dsname, sep=',')

# change standard error and output to saved output file

  stdpath = path_prefix + "DNABERT_2/output/" + fname.split(".")[0] + "/"

  try:
    os.makedirs(stdpath)
  except FileExistsError:
    pass

  stdpath_out = stdpath + "stdout.txt"
  stdpath_err = stdpath + "stderr.txt"

  sys.stdout = open(stdpath_out, 'w')
  sys.stderr = open(stdpath_err, 'w')

# print dataset shape and initial values

  print("tfbs dataset:", tfbs_dataset.shape, file=sys.stdout)
  print("tfbs dataset initial values:", tfbs_dataset.head(), file=sys.stdout)

# reformat input

  column_names = ["labels", "seq"]
  tfbs_dataset_res = pd.DataFrame(columns=column_names)

  j = 1
  for i in range(tfbs_dataset.count()['names']):
    name = tfbs_dataset['names'][i]
    pos_seq = tfbs_dataset['seq'][i]
    neg_seq = tfbs_dataset['neg_seq'][i]
    pos_label = 1
    neg_label = 0
    tfbs_dataset_res.loc[j] = [pos_label, pos_seq]
    tfbs_dataset_res.loc[j+1] = [neg_label, neg_seq]
    j+=2

  X = tfbs_dataset_res['seq']
  y = tfbs_dataset_res['labels']
  X_size = len(X)

# test range is 500 odd-numbered pairs of sequences
# training range is 500 even-numbered pairs plus remainder of the sequences over 1000
# if number of sequences is less than 1000, then give test 20% of sequences and training 80%

  if X_size < 2001:
    range_end = int(min(X_size, 2001)*0.4)
  else:
    range_end = 2001

  test_range = list(range(3,range_end,4)) + list(range(4,range_end,4))
  test_range.sort()
  train_range = list(range(1,range_end,4)) + list(range(2,range_end,4))
  train_range.sort()
  train_range = train_range + list(range(range_end,X_size))

  X_test = X.loc[test_range]
  X_train = X.loc[train_range]
  y_test = y.loc[test_range]
  y_train = y.loc[train_range]

# print size of training and test sets

  print("X_train shape:", X_train.shape, file=sys.stdout)
  print("y_train shape:", y_train.shape, file=sys.stdout)
  print("X_test shape:", X_test.shape, file=sys.stdout)
  print("y_test shape:", y_test.shape, file=sys.stdout)

# load pre-trained model

  model_cls = AutoModelForSequenceClassification.from_pretrained(model_used, num_labels=2, trust_remote_code=True)
  params = list(model_cls.named_parameters())
  tokenizer = AutoTokenizer.from_pretrained(model_used, model_max_length=30, padding_side="right", use_fast=True, trust_remote_code=True)

# reformat data to Hugging Face Dataset format from pandas

  ds_Xy_train = pd.concat([y_train, X_train], axis=1)
  ds_Xy_test = pd.concat([y_test, X_test], axis=1)

  Dataset_Xy_train = Dataset.from_pandas(ds_Xy_train)
  Dataset_Xy_test = Dataset.from_pandas(ds_Xy_test)
  Dataset_Xy_train, Dataset_Xy_test

  def tokenize(batch):
    return tokenizer(batch["seq"], return_tensors="pt", padding='longest', max_length=30, truncation=True)

  Dataset_Xy_train_tok = Dataset_Xy_train.map(tokenize, batched=True, batch_size=None)
  new_column = ["train"] * len(Dataset_Xy_train_tok)
  Dataset_Xy_train_tok = Dataset_Xy_train_tok.add_column("dset", new_column)

  Dataset_Xy_test_tok = Dataset_Xy_test.map(tokenize, batched=True, batch_size=None)
  new_column = ["test"] * len(Dataset_Xy_test_tok)
  Dataset_Xy_test_tok = Dataset_Xy_test_tok.add_column("dset", new_column)

  print(Dataset_Xy_train_tok['input_ids'][:2], file=sys.stdout)
  print(Dataset_Xy_train_tok['attention_mask'][:2], file=sys.stdout)

  dds = DatasetDict({
    'train': Dataset_Xy_train_tok,
    'test': Dataset_Xy_test_tok
  })

# switch to GPU

  model_cls = model_cls.to('cuda')

# train model

  output_dir = path_prefix + 'outputs'

  args = TrainingArguments(output_dir, learning_rate=lr, warmup_steps=warmup, fp16=True,
    evaluation_strategy="steps", per_device_train_batch_size=train_bs, per_device_eval_batch_size=eval_bs,
    eval_steps=eval_steps, save_steps=save_steps, logging_steps=100000, save_total_limit=save_total_limit,
    load_best_model_at_end=True, metric_for_best_model="eval_matthews_correlation", num_train_epochs=epochs, weight_decay=0.01, report_to='none')

  def compute_metrics(eval_preds):
    metric = evaluate.combine(["accuracy","precision","recall","f1","matthews_correlation"])
    logits, labels = eval_preds
    if isinstance(logits, tuple):  # Unpack logits if it's a tuple
      logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

  trainer.train();

# save model

  fpath = path_prefix + "DNABERT_2/Output_Models/" + fname.split(".")[0] + "/"
  print(fpath, file=sys.stdout)
  trainer.save_model(fpath)

# test model

  eval_preds = trainer.predict(dds['test'])

  print(eval_preds, file=sys.stdout)

  tfbs_ds = fname.split(".")[0] 
  testing_results_df.loc[tfbs_ds] = [eval_preds.metrics['test_loss'],
                                  eval_preds.metrics['test_accuracy'],
                                  eval_preds.metrics['test_precision'],
                                  eval_preds.metrics['test_recall'],
                                  eval_preds.metrics['test_f1'],
                                  eval_preds.metrics['test_matthews_correlation'],
                                  eval_preds.metrics['test_runtime']]

outpath = path_prefix + "DNABERT_2/output/" + run_name + ".csv"
testing_results_df.to_csv(outpath)




Some weights of the model checkpoint at zhihan1996/DNABERT-2-117M were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly ini

Map:   0%|          | 0/78815 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Matthews Correlation
200,No log,0.425756,0.804,0.745161,0.924,0.825,0.626305
400,No log,0.358138,0.842,0.773163,0.968,0.85968,0.706811
600,No log,0.331291,0.874,0.814815,0.968,0.884826,0.76158
800,No log,0.236222,0.897,0.852575,0.96,0.903104,0.800379
1000,No log,0.241238,0.916,0.889513,0.95,0.918762,0.83393
1200,No log,0.250799,0.902,0.853873,0.97,0.90824,0.81154
1400,No log,0.189577,0.929,0.910134,0.952,0.930596,0.858909
1600,No log,0.209358,0.923,0.886654,0.97,0.926457,0.849763
1800,No log,0.195008,0.922,0.876786,0.982,0.926415,0.850143
2000,No log,0.189332,0.933,0.917148,0.952,0.934249,0.866626


##### testing_results_df

In [5]:
testing_results_df

Unnamed: 0,test_loss,test_accuracy,test_precision,test_recall,test_f1,test_matthews_correlation,test_runtime
SydhGm08714Znf274UcdUniPk151-ran,0.273001,0.890977,0.948276,0.827068,0.883534,0.788422,3.0579
SydhGm12878Brca1a300IggmusUniPk151-ran,0.204711,0.949772,0.909091,1.0,0.952381,0.904073,3.145
SydhGm12878Nfe2sc22827UniPk151-ran,0.130134,0.977273,0.993289,0.961039,0.976898,0.955049,3.0996
SydhGm12878Pol3UniPk151-ran,0.648967,0.771084,0.780488,0.761905,0.771084,0.542393,2.8745
SydhGm12878Znf274UniPk151-ran,0.668913,0.576087,0.544304,0.934783,0.688,0.21843,3.228
SydhGm12878Zzz3UniPk151-ran,0.210717,0.93662,0.955882,0.915493,0.935252,0.87402,3.4508
SydhHelas3Bdp1UniPk151-ran,0.468142,0.891089,0.869159,0.920792,0.894231,0.783562,2.9145
SydhHelas3Brf1UniPk151-ran,0.66987,0.723684,0.717949,0.736842,0.727273,0.447523,2.9153
SydhHelas3Brf2UniPk151-ran,0.687634,0.533333,0.517544,0.983333,0.678161,0.152944,2.8904
SydhHelas3Znf143IggrabUniPk151-ran,0.213545,0.932331,0.909953,0.96,0.934307,0.86596,3.1333
