## Installation

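Install the Bio, transformers, genomic-benchmarks, datasets, evaluate, scikit-learn, and optuna packages. The Bio package is from Biopython; the transformers package (installed with its PyTorch extra) is for machine learning (pytorch, tensorflow); genomic-benchmarks is from ML-Bioinfo-CEITEC; datasets and evaluate are from Hugging Face; optuna is used for the hyperparameter search.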


In [3]:
!pip install -qq Bio transformers genomic-benchmarks datasets transformers[torch] evaluate scikit-learn optuna

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m78.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m104.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.6 MB/s[0m 

Access TFBS dataset in Google Drive

In [4]:
# Colab-only step: mount Google Drive at /content/drive, the root of the
# dataset and output paths used by the later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Set kmer and model parameters

In [5]:
# Settings for the run, gathered in one place.
kmer = 6                                 # k-mer length; also selects the checkpoint name
model_used = f"armheb/DNA_bert_{kmer}"   # model id passed to from_pretrained
bs, epochs = 64, 4                       # per-device train batch size, number of epochs
warmup, maxstp = 100, 1000               # warmup steps, maximum number of training steps
lr = 8e-5                                # learning rate

Testing parameter changes on 3 datasets

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import Dataset, DatasetDict, load_metric
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate
import optuna
import sys
import os

# Directory (with trailing slash) that receives the files written by this run.
stdpath = "".join(["/content/drive/MyDrive/DNABERT/output/", "optimization", "/"])

# TFBS dataset file used for this run; change the name to try another dataset.
fname = "HaibA549Yy1cV0422111Etoh02UniPk-ran.csv"
dsname = f"/content/drive/MyDrive/DNABERT/Datasets/tfbs/{fname}"

# Read the comma-separated file into a DataFrame.
tfbs_dataset = pd.read_csv(dsname, sep=',')

# Send everything written to sys.stdout / sys.stderr to files under stdpath.

# exist_ok=True accepts an already existing directory but still raises when
# the path exists as something other than a directory, instead of hiding it.
os.makedirs(stdpath, exist_ok=True)

stdpath_out = stdpath + "stdout.txt"
stdpath_err = stdpath + "stderr.txt"

# Both files are opened in 'w' mode, so re-running this cell starts them empty.
sys.stdout = open(stdpath_out, 'w')
sys.stderr = open(stdpath_err, 'w')

# reformat input
#
# Turn the wide table (one row per positive/negative pair) into a long table
# with one sequence per row. The layout matches what the split code expects:
# 1-based row labels, odd rows hold the positive sequence (label 1) and the
# following even row holds its paired negative sequence (label 0).
#
# The table is built in a single step; enlarging a DataFrame one row at a time
# with .loc copies the frame on every insertion. Every row of tfbs_dataset is
# used (the row count no longer depends on how many 'names' are non-null).

column_names = ["labels", "seq"]
n_pairs = len(tfbs_dataset)

tfbs_dataset_res = pd.DataFrame(
  {
    "labels": [1, 0] * n_pairs,
    # row-major flattening of the two columns gives pos, neg, pos, neg, ...
    "seq": tfbs_dataset[["seq", "neg_seq"]].to_numpy().ravel(),
  },
  columns=column_names,
  index=range(1, 2 * n_pairs + 1),
)

X = tfbs_dataset_res['seq']
y = tfbs_dataset_res['labels']
X_size = len(X)

# Rows are labelled 1..X_size and come in (positive, negative) pairs:
# rows 1-2, rows 3-4, and so on.
# test set:     within the first 1000 rows, every second pair (rows 3-4, 7-8, ...),
#               i.e. 250 pairs = 500 sequences
# training set: the remaining pairs of the first 1000 rows (rows 1-2, 5-6, ...)
#               plus every row after row 1000

# Clamp to the data actually present so small datasets do not raise KeyError.
head_end = min(1000, X_size)

test_range = sorted(list(range(3, head_end + 1, 4)) + list(range(4, head_end + 1, 4)))
train_range = sorted(list(range(1, head_end + 1, 4)) + list(range(2, head_end + 1, 4)))
# Row labels are 1-based, so the last row is X_size itself: the range must end
# at X_size + 1, otherwise the final sequence is silently left out of training.
train_range = train_range + list(range(1001, X_size + 1))

X_test = X.loc[test_range]
X_train = X.loc[train_range]
y_test = y.loc[test_range]
y_train = y.loc[train_range]

# report the shape of each training / test piece

for split_name, split_data in (
  ("X_train", X_train),
  ("y_train", y_train),
  ("X_test", X_test),
  ("y_test", y_test),
):
  print(f"{split_name} shape:", split_data.shape, file=sys.stdout)

# tokenization

def kmers_stride1(s, k=None):
  """Split sequence `s` into overlapping k-mers taken with a stride of 1.

  Args:
    s: sequence (string) to split.
    k: k-mer length. When omitted, the notebook-level `kmer` is looked up at
      call time, so changing `kmer` takes effect without re-running this
      definition.

  Returns:
    List of the len(s) - k + 1 substrings of length k, in order of position;
    an empty list when `s` is shorter than k.

  Raises:
    ValueError: if k is smaller than 1.
  """
  if k is None:
    k = kmer
  if k < 1:
    raise ValueError("k must be a positive integer, got %r" % (k,))
  return [s[start:start + k] for start in range(len(s) - k + 1)]




Downloading (…)lve/main/config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at armheb/DNA_bert_4 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/20017 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5999,0.540353,0.752,0.677083
2,0.4335,0.41395,0.802,0.811429
3,0.3555,0.484349,0.808,0.825455
4,0.2298,0.481835,0.824,0.834586


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at armheb/DNA_bert_4 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13803 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.647629,0.664,0.671875
2,0.639500,0.44124,0.816,0.809917
3,0.481500,0.481553,0.78,0.800725
4,0.359400,0.573276,0.774,0.796396


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at armheb/DNA_bert_4 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/45883 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1938,0.17648,0.942,0.94347
2,0.1086,0.186381,0.956,0.957031
3,0.07,0.167488,0.96,0.960784
4,0.0554,0.182025,0.958,0.958904


In [None]:
# set optuna hyperparameter trial space

def optuna_hp_space(trial):
    """Describe the hyperparameter search space for one Optuna trial.

    Asks `trial` for a log-scaled learning rate between 1e-6 and 1e-4, a
    per-device training batch size out of 16, 32, 64 or 128, and a warmup
    step count between 50 and 150.

    Args:
        trial: object exposing suggest_float, suggest_categorical and
            suggest_int.

    Returns:
        dict mapping each hyperparameter name to the value suggested by `trial`.
    """
    batch_size_choices = [16, 32, 64, 128]

    space = {}
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", batch_size_choices
    )
    space["warmup_steps"] = trial.suggest_int("warmup_steps", 50, 150)
    return space

In [None]:
# define model initialization

def model_init(trial):
    """Load a fresh two-label sequence-classification model from `model_used`.

    Args:
        trial: accepted for the caller's benefit but not used here; every call
            loads the same checkpoint.

    Returns:
        The object returned by AutoModelForSequenceClassification.from_pretrained.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(model_used, num_labels=2)
    return fresh_model

In [None]:
# load pre-trained model
# NOTE(review): the Trainer created at the end of this cell is given model=None
# together with model_init, so model_cls does not appear to be the model that is
# trained — confirm whether this copy is still needed.

model_cls = AutoModelForSequenceClassification.from_pretrained(model_used, num_labels=2)
# List of the model's named parameters; not referenced again in the cells shown here.
params = list(model_cls.named_parameters())
# Tokenizer loaded from the same checkpoint id as the model.
tokenizer = AutoTokenizer.from_pretrained(model_used)

# reformat data to Hugging Face Dataset format from pandas

# One frame per split with the labels and sequences side by side.
ds_Xy_train = pd.concat([y_train, X_train], axis=1)
ds_Xy_test = pd.concat([y_test, X_test], axis=1)

Dataset_Xy_train = Dataset.from_pandas(ds_Xy_train)
Dataset_Xy_test = Dataset.from_pandas(ds_Xy_test)

def tok_func(x):
  """Tokenize one example: its "seq" is cut into k-mers that are joined by spaces."""
  # NOTE(review): no truncation is requested here — confirm that the input
  # sequences stay within the length the model accepts.
  return tokenizer(" ".join(kmers_stride1(x["seq"])))

def _tokenize_split(dataset, split_name):
  """Tokenize every example of `dataset` and add a constant "dset" column holding `split_name`."""
  tokenized = dataset.map(tok_func, batched=False)
  return tokenized.add_column("dset", [split_name] * len(tokenized))

Dataset_Xy_train_tok = _tokenize_split(Dataset_Xy_train, "train")
Dataset_Xy_test_tok = _tokenize_split(Dataset_Xy_test, "test")

dds = DatasetDict({
  'train': Dataset_Xy_train_tok,
  'test': Dataset_Xy_test_tok
})

# switch to GPU

use_cuda = torch.cuda.device_count() > 0
if use_cuda:
  model_cls.to('cuda')

# train model

# fp16 mixed precision is requested only when a CUDA device is present, so the
# notebook can also be run on a CPU-only runtime.
# NOTE(review): max_steps and num_train_epochs are both set; the TrainingArguments
# documentation states that a positive max_steps overrides num_train_epochs.
args = TrainingArguments(
  'outputs',
  learning_rate=lr,
  warmup_steps=warmup,
  max_steps=maxstp,
  lr_scheduler_type='cosine',
  fp16=use_cuda,
  evaluation_strategy="epoch",
  per_device_train_batch_size=bs,
  per_device_eval_batch_size=bs*2,
  num_train_epochs=epochs,
  weight_decay=0.01,
  report_to='none',
)

def compute_metrics(eval_preds):
  """Score predictions with accuracy, precision, recall, F1 and Matthews correlation.

  Args:
    eval_preds: a (logits, labels) pair; the predicted class of each example
      is the argmax of its logits along the last axis.

  Returns:
    Whatever the combined metric's compute() returns for the predictions and
    reference labels.
  """
  # Build the combined metric on first use and keep it on the function:
  # re-loading five metric modules on every evaluation pass is repeated work.
  metric = getattr(compute_metrics, "_metric", None)
  if metric is None:
    metric = evaluate.combine(["accuracy","precision","recall","f1","matthews_correlation"])
    compute_metrics._metric = metric
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

# No model instance is passed: model_init is supplied instead, together with
# the training arguments, the tokenized train/test splits, the tokenizer and
# the metric function.
trainer = Trainer(
  model=None,
  args=args,
  train_dataset=dds['train'],
  eval_dataset=dds['test'],
  tokenizer=tokenizer,
  compute_metrics=compute_metrics,
  model_init=model_init,
)


In [None]:
# Run 20 Optuna trials over the space defined by optuna_hp_space, maximizing the objective.
# NOTE(review): no compute_objective is passed, so the Trainer's default objective
# is used. Per the transformers documentation that default is the evaluation loss
# when no other metric is reported and otherwise the sum of all reported metrics —
# confirm that this is the quantity that should be maximized.
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=20
)

In [None]:
# Display the result returned by the hyperparameter search.
best_trial

In [None]:
# Display the hyperparameter values recorded on the search result.
best_trial.hyperparameters