## Installation

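Install the Bio, transformers, genomic-benchmarks, datasets, and pyfaidx packages. Bio is the Biopython package; transformers (Hugging Face) provides the machine-learning models (PyTorch, TensorFlow); genomic-benchmarks comes from ML-Bioinfo-CEITEC; datasets is the Hugging Face datasets library; pyfaidx provides indexed access to FASTA files.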


In [None]:
# already set up on Expanse; toggle for colab

# pip install -qq Bio transformers genomic-benchmarks datasets transformers[torch] pyfaidx

Set path prefix for Expanse

In [None]:
# Root directory for all input and output files. Later cells build paths by
# plain string concatenation (path_prefix + "DNABERT_2/..."), so the trailing
# slash is required.
path_prefix = "/expanse/lustre/projects/nji102/sgriesmer/"

Import SNP datasets

In [None]:
import pandas as pd

# All four SNP tables live in the same directory and are read as
# tab-separated text.
cad_dir = path_prefix + "DNABERT_2/Datasets/CAD/"


def _load_snp_table(file_name):
  """Read one tab-separated SNP table from the CAD dataset directory."""
  return pd.read_csv(cad_dir + file_name, sep='\t')


snp_lt_05 = _load_snp_table("SNPlt05.vcf")
snp_lt_01 = _load_snp_table("SNPlt01.vcf")
snp_lt_001 = _load_snp_table("SNPlt001.vcf")
snp_gt_5 = _load_snp_table("SNPgt5.vcf")

# Show the first rows and the shape of every table as a quick sanity check.
(
  snp_lt_001.head(), snp_lt_001.shape,
  snp_lt_01.head(), snp_lt_01.shape,
  snp_lt_05.head(), snp_lt_05.shape,
  snp_gt_5.head(), snp_gt_5.shape,
)



Choose test set for run

In [None]:
# Pick the SNP table to score and the slice of rows to process in this run.
# The table and its label are selected through a single key so the output
# file names always describe the data that was actually used.
snp_test_sets = {
  "SNPlt05": snp_lt_05,
  "SNPlt01": snp_lt_01,
  "SNPlt001": snp_lt_001,
  "SNPgt5": snp_gt_5,
}

# NOTE(review): this cell previously scored snp_gt_5 but labelled the outputs
# "SNPlt01". The data selection is kept and the label corrected; change the
# key to "SNPlt01" if that table was the intended input.
snp_test_key = "SNPgt5"
snp_test = snp_test_sets[snp_test_key]

# Half-open positional row range [snp_start, snp_end) of the chosen table.
snp_start = 10000
snp_end = 20000

# Used as the prefix of the result file names written at the end of the run.
snp_test_name = snp_test_key + '_' + str(snp_start) + '_' + str(snp_end)

Import reference genome

In [None]:
from pyfaidx import Fasta

# Open the reference FASTA (file name indicates hg19) through pyfaidx; later
# cells look sequences up by chromosome name and slice them by position.
ref_genome = Fasta(path_prefix + "selene/selene_quickstart_tutorial/male.hg19.fasta")
# Display the chr1 record as a quick check that the file opened and is indexed.
ref_genome["chr1"]

Generate reference and alternative sequences from the alleles and collect them in a DataFrame (`snp_seq_dataset`).

In [None]:
# create a dataset

column_names = ["names", "ref_seq", "alt_seq"]

# create reference and alternate sequences

# Number of flanking bases kept on each side of the variant position.
seq_len = 75


def _build_allele_sequences(window, ref_allele, alt_allele, flank):
  """Build the upper-cased reference and alternative sequences for one variant.

  Parameters:
    window: genomic sequence whose character at index `flank` is the first
      base of the variant; it must extend far enough downstream to refill
      the sequence when the alternative allele is shorter than the reference.
    ref_allele: reference allele string.
    alt_allele: alternative allele string.
    flank: number of upstream bases placed before the allele.

  Returns:
    (ref_sequence, alt_sequence). Both are `flank` upstream bases, the allele,
    then downstream bases starting right after the reference allele; for
    short alleles each is 2*flank+1 characters long.

  The allele is always written into the sequence explicitly, so the result is
  the same whether the genome carries the reference allele, the alternative
  allele, or neither at this position.
  """
  upstream = window[0:flank]
  downstream_start = flank + len(ref_allele)
  ref_sequence = upstream + ref_allele + window[downstream_start:2*flank+1]
  # The downstream end shifts by the allele length difference so that both
  # sequences come out the same length.
  alt_sequence = upstream + alt_allele + window[downstream_start:2*flank+len(ref_allele)-len(alt_allele)+1]
  return ref_sequence.upper(), alt_sequence.upper()


# Take name, coordinates and alleles from the SAME rows. Enumerating the
# sliced names while indexing the other columns with the 0-based counter pairs
# each SNP name with the coordinates of a different row whenever snp_start > 0.
snp_rows = snp_test.iloc[snp_start:snp_end]

records = []
for i, (snp, chrom_id, pos, ref_allele, alt_allele) in enumerate(zip(
    snp_rows["Name"], snp_rows["Chromosome"], snp_rows["Position"],
    snp_rows["Reference"], snp_rows["Alternative"])):
  chrom = "chr" + str(chrom_id)
  # Position is treated as 1-based: the window starts seq_len bases upstream
  # of it and spans 3*seq_len+1 bases, leaving room for deletions.
  ref_gen_sequence = ref_genome[chrom][int(pos)-seq_len-1:int(pos)+2*seq_len].seq
  ref_sequence, alt_sequence = _build_allele_sequences(ref_gen_sequence, ref_allele, alt_allele, seq_len)
  if len(ref_sequence) != len(alt_sequence):
    print(i, "!!mismatch!!")
  records.append([snp, ref_sequence, alt_sequence])

# write into dataset (built once rather than appended to row by row)

snp_seq_dataset = pd.DataFrame(records, columns=column_names)

Transform into Hugging Face Dataset for prediction

In [None]:
from datasets import Dataset, DatasetDict, load_metric

# Wrap the pandas frame of SNP names and ref/alt sequences in a Hugging Face
# Dataset so it can be tokenized with .map() and passed to Trainer.predict().
Dataset_snp_seq = Dataset.from_pandas(snp_seq_dataset)

In [None]:
# Display the dataset summary as a quick check of the conversion.
Dataset_snp_seq

Generate predictions on each TFBS feature for reference and alternative sequences

Make dataset to store predictions for all TFBS feature models

In [None]:
# create datasets

column_names = ["TFBS dataset"]


def _named_prediction_frame():
  """Return a new frame whose single "TFBS dataset" column lists the SNP names.

  One prediction column per TFBS model is added to these frames later.
  """
  frame = pd.DataFrame(columns=column_names)
  frame["TFBS dataset"] = Dataset_snp_seq["names"]
  return frame


# Four independent frames with the same starting layout.
prob_predictions_ref_dataset = _named_prediction_frame()
prob_predictions_alt_dataset = _named_prediction_frame()
prob_predictions_diff_dataset = _named_prediction_frame()
prob_predictions_odds_dataset = _named_prediction_frame()


In [None]:
# Run configuration.
# NOTE(review): no active code in the visible notebook reads these values;
# the prediction cell loads fine-tuned models from its own model_path and
# builds a Trainer with default arguments. They look like settings carried
# over from a training notebook - confirm before relying on them.
kmer = 6  # only used by the commented-out DNA_bert model id on the next line
#model_used = "armheb/DNA_bert_" + str(kmer)
model_used = "zhihan1996/DNABERT-2-117M"
train_bs = 32
eval_bs = 32
epochs = 3
warmup = 104
lr = 3.8e-5
save_steps = 200
eval_steps = 200
save_total_limit = 3

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import numpy as np
import sys
import os

# initialize parameters

# Imported once, before the loop, instead of on every iteration.
from scipy.special import softmax


def _extract_logits(prediction_output):
  """Return the logits array from a Trainer.predict() result.

  Some models return a tuple whose first element holds the logits, others
  return the logits array directly; both cases are handled so the softmax
  below always receives a 2-D (rows x classes) array.
  """
  logits = prediction_output.predictions
  if isinstance(logits, (tuple, list)):
    logits = logits[0]
  return logits


# Use the GPU when one is present and fall back to the CPU otherwise, so the
# cell does not crash on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for fname in [
  "BroadDnd41CtcfUniPk151-ran.csv"
  ]:

  print("dataset:", fname)

  # The model directory and the result column are both named after the file stem.
  model_name = fname.split(".")[0]

  # NOTE(review): tfbs_dataset is read here but not used by the prediction
  # steps in this notebook.
  dsname = path_prefix + "DNABERT_2/Datasets/tfbs/" + fname
  tfbs_dataset = pd.read_csv(dsname, sep=',')

  # load the fine-tuned model and its tokenizer

  model_path = path_prefix + "DNABERT_2/Output_Models/" + model_name

  model_cls = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2, trust_remote_code=True)
  tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=30, padding_side="right", use_fast=True, trust_remote_code=True)

  # tokenize the reference and alternative sequences (truncated to 30 tokens)

  def tokenize_ref(batch):
    return tokenizer(batch["ref_seq"], return_tensors="pt", padding='longest', max_length=30, truncation=True)
  def tokenize_alt(batch):
    return tokenizer(batch["alt_seq"], return_tensors="pt", padding='longest', max_length=30, truncation=True)

  # batch_size=None hands the whole dataset to the tokenizer as one batch, so
  # padding='longest' pads every row to one common length.
  Dataset_snp_seq_tok_ref = Dataset_snp_seq.map(tokenize_ref, batched=True, batch_size=None)

  Dataset_snp_seq_tok_alt = Dataset_snp_seq.map(tokenize_alt, batched=True, batch_size=None)

  dds = DatasetDict({
    'eval_ref': Dataset_snp_seq_tok_ref,
    'eval_alt': Dataset_snp_seq_tok_alt
  })

  # move the model to the selected device

  model_cls = model_cls.to(device)

  # A Trainer with default arguments is used purely as a batched inference driver.

  trainer = Trainer(model_cls)

  # predictions from reference alleles

  eval_preds_ref = trainer.predict(dds['eval_ref'])

  # predictions from alternative alleles

  eval_preds_alt = trainer.predict(dds['eval_alt'])

  # find reference probabilities (column 1 = positive class)

  prob_predictions_ref = softmax(_extract_logits(eval_preds_ref), axis=1)
  pos_prob_predictions_ref = prob_predictions_ref[:, 1]

  # find alternative probabilities

  prob_predictions_alt = softmax(_extract_logits(eval_preds_alt), axis=1)
  pos_prob_predictions_alt = prob_predictions_alt[:, 1]

  # store probabilities in one column per model

  prob_predictions_ref_dataset[model_name] = pos_prob_predictions_ref

  prob_predictions_alt_dataset[model_name] = pos_prob_predictions_alt

print(prob_predictions_ref_dataset)

print(prob_predictions_alt_dataset)

# Drop SNP names from datasets to subtract them

snp_names = prob_predictions_ref_dataset["TFBS dataset"]

prob_predictions_ref_dataset_nosnp = prob_predictions_ref_dataset.drop("TFBS dataset", axis=1)
prob_predictions_alt_dataset_nosnp = prob_predictions_alt_dataset.drop("TFBS dataset", axis=1)

# Find the differences between the two sets and take the absolute value

prob_predictions_diff_dataset_nosnp = prob_predictions_ref_dataset_nosnp - prob_predictions_alt_dataset_nosnp

# The closing parenthesis was missing here, which made the whole cell a
# SyntaxError and prevented the prediction loop above from running at all.
prob_predictions_absdiff_dataset_nosnp = abs(prob_predictions_ref_dataset_nosnp - prob_predictions_alt_dataset_nosnp)

# Find max absolute difference in probabilities across TFs

max_absdiff_prob_predictions = prob_predictions_absdiff_dataset_nosnp.max(axis=1)
max_absdiff_prob_predictions_df = pd.DataFrame(columns=["snp", "max_absdiff_prob"])
max_absdiff_prob_predictions_df["snp"] = snp_names
max_absdiff_prob_predictions_df["max_absdiff_prob"] = max_absdiff_prob_predictions

# Write results to output files

# Create the output directory first so a missing folder cannot discard the
# results of the long prediction run.
results_dir = path_prefix + "DNABERT_2/output/"
os.makedirs(results_dir, exist_ok=True)

output_file = path_prefix + "DNABERT_2/output/" + snp_test_name + "-results_by_variant.csv"
max_absdiff_prob_predictions_df.to_csv(output_file, index=False, sep=',')

matrix_file = path_prefix + "DNABERT_2/output/" + snp_test_name + "-variant_matrix.csv"
prob_predictions_absdiff_dataset_nosnp.to_csv(matrix_file, index=False, sep=',')

In [None]:
# Keep the SNP identifiers (held in the "TFBS dataset" column) so they can be
# re-attached after the numeric prediction columns have been compared.
snp_names = prob_predictions_ref_dataset["TFBS dataset"]

In [None]:
# Remove the SNP-name column so only the per-model probability columns remain
# and the two frames can be subtracted element-wise.
name_column = "TFBS dataset"
prob_predictions_ref_dataset_nosnp = prob_predictions_ref_dataset.drop(columns=[name_column])
prob_predictions_alt_dataset_nosnp = prob_predictions_alt_dataset.drop(columns=[name_column])

Find the differences between the two sets and take the absolute value

In [None]:
# Signed change in positive-class probability: reference minus alternative.
prob_predictions_diff_dataset_nosnp = prob_predictions_ref_dataset_nosnp.sub(
  prob_predictions_alt_dataset_nosnp
)
prob_predictions_diff_dataset_nosnp

In [None]:
# Magnitude of the change in positive-class probability, ignoring direction.
prob_predictions_absdiff_dataset_nosnp = prob_predictions_ref_dataset_nosnp.sub(
  prob_predictions_alt_dataset_nosnp
).abs()
prob_predictions_absdiff_dataset_nosnp


Find max absolute difference in probabilities across TFs

In [None]:
# Largest absolute probability change per SNP, taken across all model columns.
max_absdiff_prob_predictions = prob_predictions_absdiff_dataset_nosnp.max(axis=1)

# Pair every SNP name with its maximum; both series share the same row index.
max_absdiff_prob_predictions_df = pd.DataFrame({
  "snp": snp_names,
  "max_absdiff_prob": max_absdiff_prob_predictions,
})

# Preview the first 50 rows.
max_absdiff_prob_predictions_df.head(50)




In [None]:
# Summary statistics of the per-SNP maximum absolute probability change.
max_absdiff_prob_predictions_df["max_absdiff_prob"].describe()

Write results to output file

In [None]:
# One row per SNP: its name and maximum absolute probability change.
output_file = f"{path_prefix}DNABERT_2/output/{snp_test_name}-results_by_variant.csv"
max_absdiff_prob_predictions_df.to_csv(output_file, sep=',', index=False)


Find difference between the log odds scores of alleles

In [None]:
# Log-odds (base 2) of the positive-class probability for the reference allele.
# Probabilities of exactly 0 or 1 yield -inf/+inf.
prob_predictions_logodds_dataset_nosnp = (np.log2(prob_predictions_ref_dataset_nosnp/(1-prob_predictions_ref_dataset_nosnp)))

# Same quantity for the alternative allele.
prob_predictions_alt_logodds_dataset_nosnp = np.log2(prob_predictions_alt_dataset_nosnp/(1-prob_predictions_alt_dataset_nosnp))

# Difference between the allele log-odds scores (reference minus alternative,
# matching the sign convention of prob_predictions_diff_dataset_nosnp). The
# cell heading promises this difference, but only the reference log-odds was
# being computed.
prob_predictions_logodds_diff_dataset_nosnp = prob_predictions_logodds_dataset_nosnp - prob_predictions_alt_logodds_dataset_nosnp

Create and save matrix file

In [None]:
# Full SNP x model matrix of absolute probability changes (no SNP-name column).
matrix_file = f"{path_prefix}DNABERT_2/output/{snp_test_name}-variant_matrix.csv"
prob_predictions_absdiff_dataset_nosnp.to_csv(matrix_file, sep=',', index=False)