<a href="https://colab.research.google.com/github/sgriesmer/njit-cad/blob/main/Testing_DNABERT_Classifier_With_TFBS_Datasets_streamlined_sjg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installation

Installing Bio, transformers, genomic-benchmarks, and datasets packages.  The Bio package is from Biopython; transformers package for machine learning (pytorch, tensorflow); genomic-benchmarks and datasets from ML-Bioinfo-CEITEC.


In [1]:
!pip install -qq Bio transformers genomic-benchmarks datasets transformers[torch]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.4 MB/s[0m et

Access TFBS dataset in Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Run model on next 10 datasets

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import Dataset, DatasetDict, load_metric
from transformers import TrainingArguments, Trainer
import numpy as np
import sys
import os

# initialize parameters

for fname in ["BroadK562Ezh239875UniPk.csv"]:

  dsname = "/content/drive/MyDrive/DNABERT/Datasets/tfbs/" + fname
  tfbs_dataset = pd.read_csv(dsname, sep=',')

# print dataset shape and initial values

  print("tfbs dataset:", tfbs_dataset.shape, file=sys.stdout)
  print("tfbs dataset initial values:", tfbs_dataset.head(), file=sys.stdout)

# reformat input

  column_names = ["labels", "seq"]
  tfbs_dataset_res = pd.DataFrame(columns=column_names)

  j = 1
  for i in range(tfbs_dataset.count()['names']):
    name = tfbs_dataset['names'][i]
    pos_seq = tfbs_dataset['seq'][i]
    neg_seq = tfbs_dataset['neg_seq'][i]
    pos_label = 1
    neg_label = 0
    tfbs_dataset_res.loc[j] = [pos_label, pos_seq]
    tfbs_dataset_res.loc[j+1] = [neg_label, neg_seq]
    j+=2

  X = tfbs_dataset_res['seq']
  y = tfbs_dataset_res['labels']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# print size of training and test sets

  print("X_train shape:", X_train.shape, file=sys.stdout)
  print("y_train shape:", y_train.shape, file=sys.stdout)
  print("X_test shape:", X_test.shape, file=sys.stdout)
  print("y_test shape:", y_test.shape, file=sys.stdout)

# tokenization

  def kmers_stride1(s, k=6):
    return [s[i:i + k] for i in range(0, len(s)-k+1)]

# load pre-trained model

  model_path = "/content/drive/MyDrive/DNABERT/Output_Models/" + fname.split(".")[0] + "/"
  model_cls = AutoModelForSequenceClassification.from_pretrained(model_path)
  tokenizer = AutoTokenizer.from_pretrained(model_path)

# reformat data to Hugging Face Dataset format from pandas

  ds_Xy_train = pd.concat([y_train, X_train], axis=1)
  ds_Xy_test = pd.concat([y_test, X_test], axis=1)

  Dataset_Xy_train = Dataset.from_pandas(ds_Xy_train)
  Dataset_Xy_test = Dataset.from_pandas(ds_Xy_test)
  Dataset_Xy_train, Dataset_Xy_test

  def tok_func(x): return tokenizer(" ".join(kmers_stride1(x["seq"])))

  Dataset_Xy_train_tok = Dataset_Xy_train.map(tok_func, batched=False)
  new_column = ["train"] * len(Dataset_Xy_train_tok)
  Dataset_Xy_train_tok = Dataset_Xy_train_tok.add_column("dset", new_column)

  Dataset_Xy_test_tok = Dataset_Xy_test.map(tok_func, batched=False)
  new_column = ["test"] * len(Dataset_Xy_test_tok)
  Dataset_Xy_test_tok = Dataset_Xy_test_tok.add_column("dset", new_column)

  dds = DatasetDict({
    'train': Dataset_Xy_train_tok,
    'test': Dataset_Xy_test_tok
  })

# switch to GPU

  if torch.cuda.device_count() > 0:
    model_cls.to('cuda')

# load model

  bs = 32
  epochs = 4
  lr = 8e-5

  args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')

  def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

# test model

  eval_preds = trainer.predict(dds['test'])

  print(eval_preds, file=sys.stdout)

# print probabilities

  from scipy.special import softmax

  prob_predictions = softmax(eval_preds[0], axis=1)
  print(prob_predictions[0:5], file=sys.stdout)
  print(eval_preds.predictions[0:5], file=sys.stdout)
  print(eval_preds.label_ids[0:5], file=sys.stdout)



tfbs dataset: (1685, 4)
tfbs dataset initial values:                 Unnamed: 0                    names  \
0    chr19:4342852-4342952    chr19:4342852-4342952   
1  chr17:79818633-79818733  chr17:79818633-79818733   
2  chr10:22611253-22611353  chr10:22611253-22611353   
3  chr14:58765050-58765150  chr14:58765050-58765150   
4  chr12:26277584-26277684  chr12:26277584-26277684   

                                                 seq  \
0  AAGGGTTTCTACTGATTGGTTTACATGGACGTCTGCCCATTGGTCA...   
1  ATTGGCCGAGGACCTCGCGCTCGCTGTCTTTCTTAGCCTCTCATTG...   
2  TCCCGCCAAGGTGGGTGTTAGGCTGGAGAGAGCCCCGACGAAAATG...   
3  CTTACTCAATAACCAACTCTCCAATAAAGTTGGTTTTCGGAAAAAG...   
4  CTTTCTGGAGAAGAAAAAAATCAAACCAAAGCCTAGACAGATATTC...   

                                             neg_seq  
0  ACGGGTGTCTTTGGGCCGTGAAGGTGATTTACATTCAACCCTAGGT...  
1  AATCTTGATCAGCGATTGGCACTAACCCGCTTCTGGGCTTCGGCTT...  
2  TGCCGTGATACTCGTAGATTGCTTAATTTAAAACTGAAGTGTGAGA...  
3  CAACTTATCAGTCTTAAGAATGTAGGTGCAGGTCGAGGTCCTTCAC...  
4  C

Map:   0%|          | 0/2527 [00:00<?, ? examples/s]

Map:   0%|          | 0/843 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


PredictionOutput(predictions=array([[-1.8984375 ,  2.2792969 ],
       [ 1.0332031 , -0.38134766],
       [-0.10705566,  0.26538086],
       ...,
       [-1.03125   ,  1.7275391 ],
       [-1.0263672 ,  1.3193359 ],
       [ 0.04772949,  0.5097656 ]], dtype=float32), label_ids=array([1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1

In [4]:
  from scipy.special import softmax

  prob_predictions = softmax(preds_output[0], axis=1)

  import tensorflow as tf

  prob_predictions = tf.math.softmax(eval_preds.predictions.logits, axis=-1)
  print(prob_predictions, file=sys.stdout)

AttributeError: ignored