## Installation

Install the Bio, transformers, genomic-benchmarks, datasets, and pyfaidx packages. The Bio package is Biopython; transformers provides the machine-learning models (PyTorch, TensorFlow); genomic-benchmarks is from ML-Bioinfo-CEITEC; datasets is the Hugging Face datasets library; pyfaidx is used to read the reference genome FASTA.


In [None]:
# already set up on Expanse; toggle for colab

# pip install -qq Bio transformers genomic-benchmarks datasets transformers[torch] pyfaidx

Set path prefix for Expanse

In [1]:
import os

# Root directory that later cells prepend to dataset and reference-genome paths.
# Defaults to the Expanse project directory; set the SNP_PATH_PREFIX environment
# variable to run the notebook elsewhere without editing this cell.
path_prefix = os.environ.get("SNP_PATH_PREFIX", "/expanse/lustre/projects/nji102/sgriesmer/")

# Later cells build paths by plain string concatenation, so the prefix must end with "/".
if not path_prefix.endswith("/"):
    path_prefix += "/"

Import SNP datasets

In [2]:
import pandas as pd

# Read the PRVCS SNP table (one row per SNP) and show its first rows together with its shape.
prvcs_csv_path = path_prefix + "DNABERT_2/Datasets/PRVCS/CAN_dataset_XY-named.csv"
snp_prvcs = pd.read_csv(prvcs_csv_path, sep=',')
(snp_prvcs.head(), snp_prvcs.shape)



(           Name Chromosome  Position Ref Alt  label
 0  SNP_CAN_0001          1   1556217   G   A      0
 1  SNP_CAN_0002          1   2484134   C   G      0
 2  SNP_CAN_0003          1   2687821   T   C      0
 3  SNP_CAN_0004          1   4033387   G   A      0
 4  SNP_CAN_0005          1   4533386   C   A      0,
 (5023, 6))

In [3]:
# Sum of the 'label' column; this equals the number of rows labelled 1
# if the labels are 0/1 (the preview above only shows 0s) — TODO confirm.
snp_prvcs['label'].sum()

2517

Choose test set for run

In [4]:
# Select the SNP table to process and the row window [snp_start, snp_end) for this run.
snp_test = snp_prvcs
snp_start = 0
# Default to the end of the selected table instead of a hard-coded row count, so the
# window stays valid if a different table is chosen; replace with an int for a partial run.
snp_end = len(snp_test)
#snp_end = 10492
# Run label encoding the data set and the processed row window.
snp_test_name = f"SNP_PRVCS_REG_C_{snp_start}_{snp_end}"
#snp_test = snp_gt_5
#snp_test_name = "SNPgt5"

Import reference genome

In [5]:
from pyfaidx import Fasta

# Open the reference genome FASTA through pyfaidx and look up chr1 as a quick sanity check.
genome_fasta_path = path_prefix + "selene/selene_quickstart_tutorial/male.hg19.fasta"
ref_genome = Fasta(genome_fasta_path)
ref_genome["chr1"]

FastaRecord("chr1")

Generate reference and alternative sequences from alleles and save as csv file.

In [6]:
# Build one (name, reference sequence, alternate sequence) row per SNP in the
# selected window snp_test[snp_start:snp_end].

column_names = ["names", "ref_seq", "alt_seq"]

# Number of flanking bases kept on each side of the variant position.
seq_len = 75


def make_allele_sequences(window, ref_allele, alt_allele, flank):
    """Return the (reference, alternate) sequences for one variant.

    Parameters
    ----------
    window : str
        Genome sequence whose first ``flank`` characters are the bases upstream of
        the variant, i.e. the variant starts at ``window[flank]``. It must extend far
        enough downstream to refill the sequence when the alternate allele is shorter
        than the reference allele.
    ref_allele, alt_allele : str
        Allele strings placed at the variant position.
    flank : int
        Number of flanking bases on each side.

    Returns
    -------
    tuple of (str, str)
        Sequences (case unchanged) that share the upstream flank, carry the respective
        allele, and are filled with downstream genome bases up to ``2 * flank + 1``
        characters when the window is long enough.
    """
    upstream = window[:flank]
    # Downstream bases always start after the *reference* allele in the genome.
    downstream_start = flank + len(ref_allele)
    ref_sequence = upstream + ref_allele + window[downstream_start:2 * flank + 1]
    # The end of the downstream slice is shifted by the allele length difference so
    # that both sequences end up with the same total length.
    alt_end = 2 * flank + len(ref_allele) - len(alt_allele) + 1
    alt_sequence = upstream + alt_allele + window[downstream_start:alt_end]
    return ref_sequence, alt_sequence


# Take every column from the same positional window so that names, coordinates and
# alleles stay aligned even when snp_start is not 0.
snp_rows = snp_test.iloc[snp_start:snp_end]

records = []
row_fields = zip(
    snp_rows["Name"],
    snp_rows["Chromosome"],
    snp_rows["Position"],
    snp_rows["Ref"],
    snp_rows["Alt"],
)
for i, (snp, chrom_id, pos, ref_allele, alt_allele) in enumerate(row_fields):
    chrom = "chr" + str(chrom_id)
    pos = int(pos)
    # Fetch seq_len bases upstream of the variant and 2 * seq_len bases from the
    # variant position onwards; the extra downstream bases cover deletions.
    ref_gen_sequence = ref_genome[chrom][pos - seq_len - 1:pos + 2 * seq_len].seq

    # The sequences are built the same way whether or not the genome carries the
    # Ref (or Alt) allele at this position, so no branching on that is needed.
    ref_sequence, alt_sequence = make_allele_sequences(
        ref_gen_sequence, ref_allele, alt_allele, seq_len
    )
    if len(ref_sequence) != len(alt_sequence):
        print(i, "!!mismatch!!")

    # make uppercase
    records.append((snp, ref_sequence.upper(), alt_sequence.upper()))

# Build the frame once instead of growing it row by row; rows are labelled 0..n-1.
snp_seq_dataset = pd.DataFrame(records, columns=column_names)

Transform into Hugging Face Dataset for prediction

In [7]:
# Display the generated table of SNP names with their reference and alternate sequences.
snp_seq_dataset

Unnamed: 0,names,ref_seq,alt_seq
0,SNP_CAN_0001,CTGGGAGCAGGAACCCTGCAGAGCCTGCCCCCCAGCTGGTGGCAGA...,CTGGGAGCAGGAACCCTGCAGAGCCTGCCCCCCAGCTGGTGGCAGA...
1,SNP_CAN_0002,CTCAGTCCCTTTCCCAGGGTCAGGGGTCCATGTATTGAACCAGGAC...,CTCAGTCCCTTTCCCAGGGTCAGGGGTCCATGTATTGAACCAGGAC...
2,SNP_CAN_0003,CTTCCAGGTGAGAATCTGACACCATAAAACAGCACCCTGCACCCCC...,CTTCCAGGTGAGAATCTGACACCATAAAACAGCACCCTGCACCCCC...
3,SNP_CAN_0004,GTCAAGTTCTCTACTCTTACTAGATGTCCAGAACAATTCCTTGTGC...,GTCAAGTTCTCTACTCTTACTAGATGTCCAGAACAATTCCTTGTGC...
4,SNP_CAN_0005,TCCCAGCCCCCATATTCTAGGGGAGAAGAAATAGACTCCAAGTCTT...,TCCCAGCCCCCATATTCTAGGGGAGAAGAAATAGACTCCAAGTCTT...
...,...,...,...
5018,SNP_CAN_5019,GTCCCTGATGTCCACATCGGCGTCCCATGTCCCCACTAGCAGCTTC...,GTCCCTGATGTCCACATCGGCGTCCCATGTCCCCACTAGCAGCTTC...
5019,SNP_CAN_5020,TCTTAAGAGTAGAGATGCAGAAGAGAGAGTGAGACCACGAAGAGAC...,TCTTAAGAGTAGAGATGCAGAAGAGAGAGTGAGACCACGAAGAGAC...
5020,SNP_CAN_5021,AGAGACTGGCTGTTGACTGCAGGGCACCACCAGCCGCCTTGGTGGT...,AGAGACTGGCTGTTGACTGCAGGGCACCACCAGCCGCCTTGGTGGT...
5021,SNP_CAN_5022,TGAAGTTCCAGTTGTTGTTTCACTGGAATAAATCTGCGTGGGTAGG...,TGAAGTTCCAGTTGTTGTTTCACTGGAATAAATCTGCGTGGGTAGG...


In [8]:
# Pull the sequences of the row labelled 4 for a manual spot check in the next cells.
test_ref_seq = snp_seq_dataset.loc[4, "ref_seq"]
test_alt_seq = snp_seq_dataset.loc[4, "alt_seq"]

In [9]:
# Spot check: the reference and alternate sequences should have the same length.
len(test_ref_seq), len(test_alt_seq)

(151, 151)

In [10]:
# Spot check: the allele sits right after the seq_len upstream flanking bases, so
# index seq_len should show the Ref base in one sequence and the Alt base in the other.
test_ref_seq[seq_len], test_alt_seq[seq_len]

('C', 'A')

In [11]:
# NOTE(review): `load_metric` was deprecated in Hugging Face `datasets` and is no longer
# available in recent major releases (its replacement lives in the `evaluate` package);
# this import may fail on a newer environment — confirm the installed version.
from datasets import Dataset, DatasetDict, load_metric

# Wrap the pandas table of SNP sequences in a Hugging Face Dataset.
Dataset_snp_seq = Dataset.from_pandas(snp_seq_dataset)

In [None]:
# Display the Dataset summary.
Dataset_snp_seq

Generate predictions on each TFBS feature for reference and alternative sequences

Make dataset to store predictions for all TFBS feature models

In [None]:
import xgboost

In [None]:
# Create the four result tables (ref, alt, diff, odds). Each starts with a single
# "TFBS dataset" column holding the SNP names from Dataset_snp_seq.

column_names = ["TFBS dataset"]


def _new_prediction_table():
    """Return a fresh frame whose "TFBS dataset" column lists the SNP names."""
    table = pd.DataFrame(columns=column_names)
    table["TFBS dataset"] = Dataset_snp_seq["names"]
    return table


prob_predictions_ref_dataset = _new_prediction_table()
prob_predictions_alt_dataset = _new_prediction_table()
prob_predictions_diff_dataset = _new_prediction_table()
prob_predictions_odds_dataset = _new_prediction_table()


In [None]:
# Model checkpoint and run settings; how each value is consumed is not visible in
# this part of the notebook, so the notes below are assumptions to confirm.
kmer = 6  # only referenced by the commented-out DNA_bert model name on the next line
#model_used = "armheb/DNA_bert_" + str(kmer)
model_used = "zhihan1996/DNABERT-2-117M"  # model identifier string
train_bs = 32  # presumably the training batch size — TODO confirm
eval_bs = 32  # presumably the evaluation batch size — TODO confirm
epochs = 3  # presumably the number of training epochs — TODO confirm
warmup = 104  # presumably learning-rate warmup steps — TODO confirm
lr = 3.8e-5  # presumably the learning rate — TODO confirm
save_steps = 200  # presumably checkpoint interval in steps — TODO confirm
eval_steps = 200  # presumably evaluation interval in steps — TODO confirm
save_total_limit = 3  # presumably the maximum number of checkpoints kept — TODO confirm

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import numpy as np
import sys
import os
from scipy.stats import zscore

# initialize parameters

for fname in [
  "BroadDnd41CtcfUniPk151-ran.csv",
  "BroadDnd41Ezh239875UniPk151-ran.csv",
  "BroadGm12878CtcfUniPk151-ran.csv",
  "BroadGm12878Ezh239875UniPk151-ran.csv",
  "BroadH1hescChd1a301218aUniPk151-ran.csv",
  "BroadH1hescCtcfUniPk151-ran.csv",
  "BroadH1hescEzh239875UniPk151-ran.csv",
  "BroadH1hescJarid1aab26049UniPk151-ran.csv",
  "BroadH1hescRbbp5a300109aUniPk151-ran.csv",
  "BroadHelas3CtcfUniPk151-ran.csv",
  "BroadHelas3Ezh239875UniPk151-ran.csv",
  "BroadHelas3Pol2bUniPk151-ran.csv",
  "BroadHepg2CtcfUniPk151-ran.csv",
  "BroadHepg2Ezh239875UniPk151-ran.csv",
  "BroadHmecCtcfUniPk151-ran.csv",
  "BroadHmecEzh239875UniPk151-ran.csv",
  "BroadHsmmCtcfUniPk151-ran.csv",
  "BroadHsmmEzh239875UniPk151-ran.csv",
  "BroadHsmmtCtcfUniPk151-ran.csv",
  "BroadHsmmtEzh239875UniPk151-ran.csv",
  "BroadHuvecCtcfUniPk151-ran.csv",
  "BroadHuvecEzh239875UniPk151-ran.csv",
  "BroadHuvecPol2bUniPk151-ran.csv",
  "BroadK562Chd1a301218aUniPk151-ran.csv",
  "BroadK562CtcfUniPk151-ran.csv",
  "BroadK562Ezh239875UniPk151-ran.csv",
  "BroadK562Hdac1sc6298UniPk151-ran.csv",
  "BroadK562Hdac2a300705aUniPk151-ran.csv",
  "BroadK562Hdac6a301341aUniPk151-ran.csv",
  "BroadK562P300UniPk151-ran.csv",
  "BroadK562Phf8a301772aUniPk151-ran.csv",
  "BroadK562Plu1UniPk151-ran.csv",
  "BroadK562Pol2bUniPk151-ran.csv",
  "BroadK562Rbbp5a300109aUniPk151-ran.csv",
  "BroadK562Sap3039731UniPk151-ran.csv",
  "BroadNhaCtcfUniPk151-ran.csv",
  "BroadNhaEzh239875UniPk151-ran.csv",
  "BroadNhdfadCtcfUniPk151-ran.csv",
  "BroadNhdfadEzh239875UniPk151-ran.csv",
  "BroadNhekCtcfUniPk151-ran.csv",
  "BroadNhekEzh239875UniPk151-ran.csv",
  "BroadNhekPol2bUniPk151-ran.csv",
  "BroadNhlfCtcfUniPk151-ran.csv",
  "BroadNhlfEzh239875UniPk151-ran.csv",
  "BroadOsteoblCtcfUniPk151-ran.csv",
  "HaibA549Atf3V0422111Etoh02UniPk151-ran.csv",
  "HaibA549Bcl3V0422111Etoh02UniPk151-ran.csv",
  "HaibA549Creb1sc240V0416102Dex100nmUniPk151-ran.csv",
  "HaibA549Ctcfsc5916Pcr1xDex100nmUniPk151-ran.csv",
  "HaibA549Ctcfsc5916Pcr1xEtoh02UniPk151-ran.csv",
  "HaibA549Elf1V0422111Etoh02UniPk151-ran.csv",
  "HaibA549Ets1V0422111Etoh02UniPk151-ran.csv",
  "HaibA549Fosl2V0422111Etoh02UniPk151-ran.csv",
  "HaibA549Foxa1V0416102Dex100nmUniPk151-ran.csv",
  "HaibA549GabpV0422111Etoh02UniPk151-ran.csv",
  "HaibA549GrPcr1xDex500pmUniPk151-ran.csv",
  "HaibA549GrPcr1xDex50nmUniPk151-ran.csv",
  "HaibA549GrPcr1xDex5nmUniPk151-ran.csv",
  "HaibA549GrPcr2xDex100nmUniPk151-ran.csv",
  "HaibA549NrsfV0422111Etoh02UniPk151-ran.csv",
  "HaibA549P300V0422111Etoh02UniPk151-ran.csv",
  "HaibA549Pol2Pcr2xDex100nmUniPk151-ran.csv",
  "HaibA549Pol2Pcr2xEtoh02UniPk151-ran.csv",
  "HaibA549Sin3ak20V0422111Etoh02UniPk151-ran.csv",
  "HaibA549Six5V0422111Etoh02UniPk151-ran.csv",
  "HaibA549Taf1V0422111Etoh02UniPk151-ran.csv",
  "HaibA549Tcf12V0422111Etoh02UniPk151-ran.csv",
  "HaibA549Usf1Pcr1xDex100nmUniPk151-ran.csv",
  "HaibA549Usf1Pcr1xEtoh02UniPk151-ran.csv",
  "HaibA549Usf1V0422111Etoh02UniPk151-ran.csv",
  "HaibA549Yy1cV0422111Etoh02UniPk151-ran.csv",
  "HaibA549Zbtb33V0422111Etoh02UniPk151-ran.csv",
  "HaibEcc1CtcfcV0416102Dm002p1hUniPk151-ran.csv",
  "HaibEcc1EraaV0416102Bpa1hUniPk151-ran.csv",
  "HaibEcc1EralphaaV0416102Est10nm1hUniPk151-ran.csv",
  "HaibEcc1EralphaaV0416102Gen1hUniPk151-ran.csv",
  "HaibEcc1Foxa1sc6553V0416102Dm002p1hUniPk151-ran.csv",
  "HaibEcc1GrV0416102Dex100nmUniPk151-ran.csv",
  "HaibEcc1Pol2V0416102Dm002p1hUniPk151-ran.csv",
  "HaibGm12878Atf2sc81188V0422111UniPk151-ran.csv",
  "HaibGm12878Atf3Pcr1xUniPk151-ran.csv",
  "HaibGm12878BatfPcr1xUniPk151-ran.csv",
  "HaibGm12878Bcl11aPcr1xUniPk151-ran.csv",
  "HaibGm12878Bcl3V0416101UniPk151-ran.csv",
  "HaibGm12878Bclaf101388V0416101UniPk151-ran.csv",
  "HaibGm12878Cebpbsc150V0422111UniPk151-ran.csv",
  "HaibGm12878Ebf1sc137065Pcr1xUniPk151-ran.csv",
  "HaibGm12878Egr1Pcr2xUniPk151-ran.csv",
  "HaibGm12878Elf1sc631V0416101UniPk151-ran.csv",
  "HaibGm12878Ets1Pcr1xUniPk151-ran.csv",
  "HaibGm12878Foxm1sc502V0422111UniPk151-ran.csv",
  "HaibGm12878GabpPcr2xUniPk151-ran.csv",
  "HaibGm12878Irf4sc6059Pcr1xUniPk151-ran.csv",
  "HaibGm12878Mef2aPcr1xUniPk151-ran.csv",
  "HaibGm12878Mef2csc13268V0416101UniPk151-ran.csv",
  "HaibGm12878Mta3sc81325V0422111UniPk151-ran.csv",
  "HaibGm12878Nfatc1sc17834V0422111UniPk151-ran.csv",
  "HaibGm12878Nficsc81335V0422111UniPk151-ran.csv",
  "HaibGm12878NrsfPcr1xUniPk151-ran.csv",
  "HaibGm12878P300Pcr1xUniPk151-ran.csv",
  "HaibGm12878Pax5c20Pcr1xUniPk151-ran.csv",
  "HaibGm12878Pax5n19Pcr1xUniPk151-ran.csv",
  "HaibGm12878Pbx3Pcr1xUniPk151-ran.csv",
  "HaibGm12878Pmlsc71910V0422111UniPk151-ran.csv",
  "HaibGm12878Pol24h8Pcr1xUniPk151-ran.csv",
  "HaibGm12878Pol2Pcr2xUniPk151-ran.csv",
  "HaibGm12878Pou2f2Pcr1xUniPk151-ran.csv",
  "HaibGm12878Pu1Pcr1xUniPk151-ran.csv",
  "HaibGm12878Rad21V0416101UniPk151-ran.csv",
  "HaibGm12878Runx3sc101553V0422111UniPk151-ran.csv",
  "HaibGm12878RxraPcr1xUniPk151-ran.csv",
  "HaibGm12878Six5Pcr1xUniPk151-ran.csv",
  "HaibGm12878Sp1Pcr1xUniPk151-ran.csv",
  "HaibGm12878SrfPcr2xUniPk151-ran.csv",
  "HaibGm12878Stat5asc74442V0422111UniPk151-ran.csv",
  "HaibGm12878Taf1Pcr1xUniPk151-ran.csv",
  "HaibGm12878Tcf12Pcr1xUniPk151-ran.csv",
  "HaibGm12878Tcf3Pcr1xUniPk151-ran.csv",
  "HaibGm12878Usf1Pcr2xUniPk151-ran.csv",
  "HaibGm12878Yy1sc281Pcr1xUniPk151-ran.csv",
  "HaibGm12878Zbtb33Pcr1xUniPk151-ran.csv",
  "HaibGm12878Zeb1sc25388V0416102UniPk151-ran.csv",
  "HaibGm12891Pax5c20V0416101UniPk151-ran.csv",
  "HaibGm12891Pol24h8Pcr1xUniPk151-ran.csv",
  "HaibGm12891Pol2Pcr1xUniPk151-ran.csv",
  "HaibGm12891Pou2f2Pcr1xUniPk151-ran.csv",
  "HaibGm12891Pu1Pcr1xUniPk151-ran.csv",
  "HaibGm12891Taf1Pcr1xUniPk151-ran.csv",
  "HaibGm12891Yy1sc281V0416101UniPk151-ran.csv",
  "HaibGm12892Pax5c20V0416101UniPk151-ran.csv",
  "HaibGm12892Pol24h8V0416102UniPk151-ran.csv",
  "HaibGm12892Pol2V0416102UniPk151-ran.csv",
  "HaibGm12892Taf1V0416102UniPk151-ran.csv",
  "HaibGm12892Yy1V0416101UniPk151-ran.csv",
  "HaibH1hescAtf2sc81188V0422111UniPk151-ran.csv",
  "HaibH1hescAtf3V0416102UniPk151-ran.csv",
  "HaibH1hescBcl11aPcr1xUniPk151-ran.csv",
  "HaibH1hescCtcfsc5916V0416102UniPk151-ran.csv",
  "HaibH1hescEgr1V0416102UniPk151-ran.csv",
  "HaibH1hescFosl1sc183V0416102UniPk151-ran.csv",
  "HaibH1hescGabpPcr1xUniPk151-ran.csv",
  "HaibH1hescHdac2sc6296V0416102UniPk151-ran.csv",
  "HaibH1hescJundV0416102UniPk151-ran.csv",
  "HaibH1hescNanogsc33759V0416102UniPk151-ran.csv",
  "HaibH1hescNrsfV0416102UniPk151-ran.csv",
  "HaibH1hescP300V0416102UniPk151-ran.csv",
  "HaibH1hescPol24h8V0416102UniPk151-ran.csv",
  "HaibH1hescPol2V0416102UniPk151-ran.csv",
  "HaibH1hescPou5f1sc9081V0416102UniPk151-ran.csv",
  "HaibH1hescRad21V0416102UniPk151-ran.csv",
  "HaibH1hescRxraV0416102UniPk151-ran.csv",
  "HaibH1hescSin3ak20Pcr1xUniPk151-ran.csv",
  "HaibH1hescSix5Pcr1xUniPk151-ran.csv",
  "HaibH1hescSp1Pcr1xUniPk151-ran.csv",
  "HaibH1hescSp2V0422111UniPk151-ran.csv",
  "HaibH1hescSp4v20V0422111UniPk151-ran.csv",
  "HaibH1hescSrfPcr1xUniPk151-ran.csv",
  "HaibH1hescTaf1V0416102UniPk151-ran.csv",
  "HaibH1hescTaf7sc101167V0416102UniPk151-ran.csv",
  "HaibH1hescTcf12Pcr1xUniPk151-ran.csv",
  "HaibH1hescTead4sc101184V0422111UniPk151-ran.csv",
  "HaibH1hescUsf1Pcr1xUniPk151-ran.csv",
  "HaibH1hescYy1sc281V0416102UniPk151-ran.csv",
  "HaibHct116Pol24h8V0416101UniPk151-ran.csv",
  "HaibHct116Yy1sc281V0416101UniPk151-ran.csv",
  "HaibHct116Zbtb33V0416101UniPk151-ran.csv",
  "HaibHelas3GabpPcr1xUniPk151-ran.csv",
  "HaibHelas3NrsfPcr1xUniPk151-ran.csv",
  "HaibHelas3Pol2Pcr1xUniPk151-ran.csv",
  "HaibHelas3Taf1Pcr1xUniPk151-ran.csv",
  "HaibHepg2Atf3V0416101UniPk151-ran.csv",
  "HaibHepg2Bhlhe40V0416101UniPk151-ran.csv",
  "HaibHepg2Cebpbsc150V0416101UniPk151-ran.csv",
  "HaibHepg2Cebpdsc636V0416101UniPk151-ran.csv",
  "HaibHepg2Ctcfsc5916V0416101UniPk151-ran.csv",
  "HaibHepg2Elf1sc631V0416101UniPk151-ran.csv",
  "HaibHepg2Fosl2V0416101UniPk151-ran.csv",
  "HaibHepg2Foxa1sc101058V0416101UniPk151-ran.csv",
  "HaibHepg2Foxa1sc6553V0416101UniPk151-ran.csv",
  "HaibHepg2Foxa2sc6554V0416101UniPk151-ran.csv",
  "HaibHepg2GabpPcr2xUniPk151-ran.csv",
  "HaibHepg2Hdac2sc6296V0416101UniPk151-ran.csv",
  "HaibHepg2Hnf4asc8987V0416101UniPk151-ran.csv",
  "HaibHepg2Hnf4gsc6558V0416101UniPk151-ran.csv",
  "HaibHepg2JundPcr1xUniPk151-ran.csv",
  "HaibHepg2Mbd4sc271530V0422111UniPk151-ran.csv",
  "HaibHepg2Mybl2sc81192V0422111UniPk151-ran.csv",
  "HaibHepg2Nficsc81335V0422111UniPk151-ran.csv",
  "HaibHepg2NrsfPcr2xUniPk151-ran.csv",
  "HaibHepg2NrsfV0416101UniPk151-ran.csv",
  "HaibHepg2P300V0416101UniPk151-ran.csv",
  "HaibHepg2Pol24h8V0416102UniPk151-ran.csv",
  "HaibHepg2Pol2Pcr2xUniPk151-ran.csv",
  "HaibHepg2Rad21V0416101UniPk151-ran.csv",
  "HaibHepg2RxraPcr1xUniPk151-ran.csv",
  "HaibHepg2Sin3ak20Pcr1xUniPk151-ran.csv",
  "HaibHepg2Sp1Pcr1xUniPk151-ran.csv",
  "HaibHepg2Sp2V0422111UniPk151-ran.csv",
  "HaibHepg2SrfV0416101UniPk151-ran.csv",
  "HaibHepg2Taf1Pcr2xUniPk151-ran.csv",
  "HaibHepg2Tcf12Pcr1xUniPk151-ran.csv",
  "HaibHepg2Tead4sc101184V0422111UniPk151-ran.csv",
  "HaibHepg2Usf1Pcr1xUniPk151-ran.csv",
  "HaibHepg2Yy1sc281V0416101UniPk151-ran.csv",
  "HaibHepg2Zbtb33Pcr1xUniPk151-ran.csv",
  "HaibHepg2Zbtb7aV0416101UniPk151-ran.csv",
  "HaibHuvecPol24h8V0416101UniPk151-ran.csv",
  "HaibHuvecPol2Pcr1xUniPk151-ran.csv",
  "HaibK562Atf3V0416101UniPk151-ran.csv",
  "HaibK562Bcl3Pcr1xUniPk151-ran.csv",
  "HaibK562Bclaf101388Pcr1xUniPk151-ran.csv",
  "HaibK562Cbx3sc101004V0422111UniPk151-ran.csv",
  "HaibK562Cebpbsc150V0422111UniPk151-ran.csv",
  "HaibK562CtcfcPcr1xUniPk151-ran.csv",
  "HaibK562Ctcflsc98982V0416101UniPk151-ran.csv",
  "HaibK562E2f6V0416102UniPk151-ran.csv",
  "HaibK562Egr1V0416101UniPk151-ran.csv",
  "HaibK562Elf1sc631V0416102UniPk151-ran.csv",
  "HaibK562Ets1V0416101UniPk151-ran.csv",
  "HaibK562Fosl1sc183V0416101UniPk151-ran.csv",
  "HaibK562GabpV0416101UniPk151-ran.csv",
  "HaibK562Gata2sc267Pcr1xUniPk151-ran.csv",
  "HaibK562Hdac2sc6296V0416102UniPk151-ran.csv",
  "HaibK562MaxV0416102UniPk151-ran.csv",
  "HaibK562Mef2aV0416101UniPk151-ran.csv",
  "HaibK562Nr2f2sc271940V0422111UniPk151-ran.csv",
  "HaibK562NrsfV0416102UniPk151-ran.csv",
  "HaibK562Pmlsc71910V0422111UniPk151-ran.csv",
  "HaibK562Pol24h8V0416101UniPk151-ran.csv",
  "HaibK562Pol2V0416101UniPk151-ran.csv",
  "HaibK562Pu1Pcr1xUniPk151-ran.csv",
  "HaibK562Rad21V0416102UniPk151-ran.csv",
  "HaibK562Sin3ak20V0416101UniPk151-ran.csv",
  "HaibK562Six5Pcr1xUniPk151-ran.csv",
  "HaibK562Sp1Pcr1xUniPk151-ran.csv",
  "HaibK562Sp2sc643V0416102UniPk151-ran.csv",
  "HaibK562SrfV0416101UniPk151-ran.csv",
  "HaibK562Stat5asc74442V0422111UniPk151-ran.csv",
  "HaibK562Taf1V0416101UniPk151-ran.csv",
  "HaibK562Taf7sc101167V0416101UniPk151-ran.csv",
  "HaibK562Tead4sc101184V0422111UniPk151-ran.csv",
  "HaibK562Thap1sc98174V0416101UniPk151-ran.csv",
  "HaibK562Trim28sc81411V0422111UniPk151-ran.csv",
  "HaibK562Usf1V0416101UniPk151-ran.csv",
  "HaibK562Yy1V0416101UniPk151-ran.csv",
  "HaibK562Yy1V0416102UniPk151-ran.csv",
  "HaibK562Zbtb33Pcr1xUniPk151-ran.csv",
  "HaibK562Zbtb7asc34508V0416101UniPk151-ran.csv",
  "HaibPanc1NrsfPcr2xUniPk151-ran.csv",
  "HaibPanc1Pol24h8V0416101UniPk151-ran.csv",
  "HaibPanc1Sin3ak20V0416101UniPk151-ran.csv",
  "HaibPfsk1Foxp2Pcr2xUniPk151-ran.csv",
  "HaibPfsk1NrsfPcr2xUniPk151-ran.csv",
  "HaibPfsk1Sin3ak20V0416101UniPk151-ran.csv",
  "HaibPfsk1Taf1V0416101UniPk151-ran.csv",
  "HaibSknmcFoxp2Pcr2xUniPk151-ran.csv",
  "HaibSknmcPol24h8V0416101UniPk151-ran.csv",
  "HaibSknshNrsfPcr2xUniPk151-ran.csv",
  "HaibSknshNrsfV0416101UniPk151-ran.csv",
  "HaibSknshPol24h8V0416101UniPk151-ran.csv",
  "HaibSknshSin3ak20V0416101UniPk151-ran.csv",
  "HaibSknshTaf1V0416101UniPk151-ran.csv",
  "HaibSknshraCtcfV0416102UniPk151-ran.csv",
  "HaibSknshraP300V0416102UniPk151-ran.csv",
  "HaibSknshraRad21V0416102UniPk151-ran.csv",
  "HaibSknshraUsf1sc8983V0416102UniPk151-ran.csv",
  "HaibSknshraYy1sc281V0416102UniPk151-ran.csv",
  "HaibT47dCtcfsc5916V0416102Dm002p1hUniPk151-ran.csv",
  "HaibT47dEraaV0416102Bpa1hUniPk151-ran.csv",
  "HaibT47dEralphaaPcr2xGen1hUniPk151-ran.csv",
  "HaibT47dEralphaaV0416102Est10nm1hUniPk151-ran.csv",
  "HaibT47dFoxa1sc6553V0416102Dm002p1hUniPk151-ran.csv",
  "HaibT47dGata3sc268V0416102Dm002p1hUniPk151-ran.csv",
  "HaibT47dP300V0416102Dm002p1hUniPk151-ran.csv",
  "HaibU87NrsfPcr2xUniPk151-ran.csv",
  "HaibU87Pol24h8V0416101UniPk151-ran.csv",
  "SydhA549Bhlhe40IggrabUniPk151-ran.csv",
  "SydhA549CebpbIggrabUniPk151-ran.csv",
  "SydhA549MaxIggrabUniPk151-ran.csv",
  "SydhA549Pol2s2IggrabUniPk151-ran.csv",
  "SydhA549Rad21IggrabUniPk151-ran.csv",
  "SydhGm08714Znf274UcdUniPk151-ran.csv",
  "SydhGm10847NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm10847Pol2IggmusUniPk151-ran.csv",
  "SydhGm12878Bhlhe40cIggmusUniPk151-ran.csv",
  "SydhGm12878Brca1a300IggmusUniPk151-ran.csv",
  "SydhGm12878CfosUniPk151-ran.csv",
  "SydhGm12878Chd1a301218aIggmusUniPk151-ran.csv",
  "SydhGm12878Chd2ab68301IggmusUniPk151-ran.csv",
  "SydhGm12878Corestsc30189IggmusUniPk151-ran.csv",
  "SydhGm12878Ctcfsc15914c20UniPk151-ran.csv",
  "SydhGm12878E2f4IggmusUniPk151-ran.csv",
  "SydhGm12878Ebf1sc137065UniPk151-ran.csv",
  "SydhGm12878Elk112771IggmusUniPk151-ran.csv",
  "SydhGm12878Ikzf1iknuclaUniPk151-ran.csv",
  "SydhGm12878JundUniPk151-ran.csv",
  "SydhGm12878MaxIggmusUniPk151-ran.csv",
  "SydhGm12878Mazab85725IggmusUniPk151-ran.csv",
  "SydhGm12878Mxi1IggmusUniPk151-ran.csv",
  "SydhGm12878Nfe2sc22827UniPk151-ran.csv",
  "SydhGm12878NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm12878NfyaIggmusUniPk151-ran.csv",
  "SydhGm12878NfybIggmusUniPk151-ran.csv",
  "SydhGm12878Nrf1IggmusUniPk151-ran.csv",
  "SydhGm12878P300IggmusUniPk151-ran.csv",
  "SydhGm12878P300bUniPk151-ran.csv",
  "SydhGm12878Pol2IggmusUniPk151-ran.csv",
  "SydhGm12878Pol2UniPk151-ran.csv",
  "SydhGm12878Pol2s2IggmusUniPk151-ran.csv",
  "SydhGm12878Pol3UniPk151-ran.csv",
  "SydhGm12878Rad21IggrabUniPk151-ran.csv",
  "SydhGm12878Rfx5200401194IggmusUniPk151-ran.csv",
  "SydhGm12878Sin3anb6001263IggmusUniPk151-ran.csv",
  "SydhGm12878Smc3ab9263IggmusUniPk151-ran.csv",
  "SydhGm12878Stat1UniPk151-ran.csv",
  "SydhGm12878Stat3IggmusUniPk151-ran.csv",
  "SydhGm12878Tblr1ab24550IggmusUniPk151-ran.csv",
  "SydhGm12878TbpIggmusUniPk151-ran.csv",
  "SydhGm12878Tr4UniPk151-ran.csv",
  "SydhGm12878Usf2IggmusUniPk151-ran.csv",
  "SydhGm12878WhipIggmusUniPk151-ran.csv",
  "SydhGm12878Yy1UniPk151-ran.csv",
  "SydhGm12878Znf143166181apUniPk151-ran.csv",
  "SydhGm12878Znf274UniPk151-ran.csv",
  "SydhGm12878Zzz3UniPk151-ran.csv",
  "SydhGm12891NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm12891Pol2IggmusUniPk151-ran.csv",
  "SydhGm12892NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm12892Pol2IggmusUniPk151-ran.csv",
  "SydhGm15510NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm15510Pol2IggmusUniPk151-ran.csv",
  "SydhGm18505NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm18505Pol2IggmusUniPk151-ran.csv",
  "SydhGm18526NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm18526Pol2IggmusUniPk151-ran.csv",
  "SydhGm18951NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm18951Pol2IggmusUniPk151-ran.csv",
  "SydhGm19099NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm19099Pol2IggmusUniPk151-ran.csv",
  "SydhGm19193NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm19193Pol2IggmusUniPk151-ran.csv",
  "SydhH1hescBach1sc14700IggrabUniPk151-ran.csv",
  "SydhH1hescBrca1IggrabUniPk151-ran.csv",
  "SydhH1hescCebpbIggrabUniPk151-ran.csv",
  "SydhH1hescChd1a301218aIggrabUniPk151-ran.csv",
  "SydhH1hescChd2IggrabUniPk151-ran.csv",
  "SydhH1hescCjunIggrabUniPk151-ran.csv",
  "SydhH1hescCmycIggrabUniPk151-ran.csv",
  "SydhH1hescCtbp2UcdUniPk151-ran.csv",
  "SydhH1hescGtf2f1IggrabUniPk151-ran.csv",
  "SydhH1hescJundIggrabUniPk151-ran.csv",
  "SydhH1hescMafkIggrabUniPk151-ran.csv",
  "SydhH1hescMaxUcdUniPk151-ran.csv",
  "SydhH1hescMxi1IggrabUniPk151-ran.csv",
  "SydhH1hescNrf1IggrabUniPk151-ran.csv",
  "SydhH1hescRad21IggrabUniPk151-ran.csv",
  "SydhH1hescRfx5200401194IggrabUniPk151-ran.csv",
  "SydhH1hescSin3anb6001263IggrabUniPk151-ran.csv",
  "SydhH1hescSuz12UcdUniPk151-ran.csv",
  "SydhH1hescTbpIggrabUniPk151-ran.csv",
  "SydhH1hescUsf2IggrabUniPk151-ran.csv",
  "SydhH1hescZnf143IggrabUniPk151-ran.csv",
  "SydhGm12878NfybIggmusUniPk151-ran.csv",
  "SydhGm12878Nrf1IggmusUniPk151-ran.csv",
  "SydhGm12878P300IggmusUniPk151-ran.csv",
  "SydhGm12878P300bUniPk151-ran.csv",
  "SydhGm12878Pol2IggmusUniPk151-ran.csv",
  "SydhGm12878Pol2UniPk151-ran.csv",
  "SydhGm12878Pol2s2IggmusUniPk151-ran.csv",
  "SydhGm12878Pol3UniPk151-ran.csv",
  "SydhGm12878Rad21IggrabUniPk151-ran.csv",
  "SydhGm12878Rfx5200401194IggmusUniPk151-ran.csv",
  "SydhGm12878Sin3anb6001263IggmusUniPk151-ran.csv",
  "SydhGm12878Smc3ab9263IggmusUniPk151-ran.csv",
  "SydhGm12878Stat1UniPk151-ran.csv",
  "SydhGm12878Stat3IggmusUniPk151-ran.csv",
  "SydhGm12878Tblr1ab24550IggmusUniPk151-ran.csv",
  "SydhGm12878TbpIggmusUniPk151-ran.csv",
  "SydhGm12878Tr4UniPk151-ran.csv",
  "SydhGm12878Usf2IggmusUniPk151-ran.csv",
  "SydhGm12878WhipIggmusUniPk151-ran.csv",
  "SydhGm12878Yy1UniPk151-ran.csv",
  "SydhGm12878Znf143166181apUniPk151-ran.csv",
  "SydhGm12878Znf274UniPk151-ran.csv",
  "SydhGm12878Zzz3UniPk151-ran.csv",
  "SydhGm12891NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm12891Pol2IggmusUniPk151-ran.csv",
  "SydhGm12892NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm12892Pol2IggmusUniPk151-ran.csv",
  "SydhGm15510NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm15510Pol2IggmusUniPk151-ran.csv",
  "SydhGm18505NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm18505Pol2IggmusUniPk151-ran.csv",
  "SydhGm18526NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm18526Pol2IggmusUniPk151-ran.csv",
  "SydhGm18951NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm18951Pol2IggmusUniPk151-ran.csv",
  "SydhGm19099NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm19099Pol2IggmusUniPk151-ran.csv",
  "SydhGm19193NfkbTnfaIggrabUniPk151-ran.csv",
  "SydhGm19193Pol2IggmusUniPk151-ran.csv",
  "SydhH1hescBach1sc14700IggrabUniPk151-ran.csv",
  "SydhH1hescBrca1IggrabUniPk151-ran.csv",
  "SydhH1hescCebpbIggrabUniPk151-ran.csv",
  "SydhH1hescChd1a301218aIggrabUniPk151-ran.csv",
  "SydhH1hescChd2IggrabUniPk151-ran.csv",
  "SydhH1hescCjunIggrabUniPk151-ran.csv",
  "SydhH1hescCmycIggrabUniPk151-ran.csv",
  "SydhH1hescCtbp2UcdUniPk151-ran.csv",
  "SydhH1hescGtf2f1IggrabUniPk151-ran.csv",
  "SydhH1hescJundIggrabUniPk151-ran.csv",
  "SydhH1hescMafkIggrabUniPk151-ran.csv",
  "SydhH1hescMaxUcdUniPk151-ran.csv",
  "SydhH1hescMxi1IggrabUniPk151-ran.csv",
  "SydhH1hescNrf1IggrabUniPk151-ran.csv",
  "SydhH1hescRad21IggrabUniPk151-ran.csv",
  "SydhH1hescRfx5200401194IggrabUniPk151-ran.csv",
  "SydhH1hescSin3anb6001263IggrabUniPk151-ran.csv",
  "SydhH1hescSuz12UcdUniPk151-ran.csv",
  "SydhH1hescTbpIggrabUniPk151-ran.csv",
  "SydhH1hescUsf2IggrabUniPk151-ran.csv",
  "SydhH1hescZnf143IggrabUniPk151-ran.csv",
  "SydhHct116Pol2UcdUniPk151-ran.csv",
  "SydhHct116Tcf7l2UcdUniPk151-ran.csv",
  "SydhHek293Elk4UcdUniPk151-ran.csv",
  "SydhHek293Kap1UcdUniPk151-ran.csv",
  "SydhHek293Pol2UniPk151-ran.csv",
  "SydhHek293Tcf7l2UcdUniPk151-ran.csv",
  "SydhHek293tZnf263UcdUniPk151-ran.csv",
  "SydhHelas3Ap2alphaUniPk151-ran.csv",
  "SydhHelas3Ap2gammaUniPk151-ran.csv",
  "SydhHelas3Baf155IggmusUniPk151-ran.csv",
  "SydhHelas3Baf170IggmusUniPk151-ran.csv",
  "SydhHelas3Bdp1UniPk151-ran.csv",
  "SydhHelas3Brca1a300IggrabUniPk151-ran.csv",
  "SydhHelas3Brf1UniPk151-ran.csv",
  "SydhHelas3Brf2UniPk151-ran.csv",
  "SydhHelas3Brg1IggmusUniPk151-ran.csv",
  "SydhHelas3CebpbIggrabUniPk151-ran.csv",
  "SydhHelas3CfosUniPk151-ran.csv",
  "SydhHelas3Chd2IggrabUniPk151-ran.csv",
  "SydhHelas3CjunIggrabUniPk151-ran.csv",
  "SydhHelas3CmycUniPk151-ran.csv",
  "SydhHelas3Corestsc30189IggrabUniPk151-ran.csv",
  "SydhHelas3E2f1UniPk151-ran.csv",
  "SydhHelas3E2f4UniPk151-ran.csv",
  "SydhHelas3E2f6UniPk151-ran.csv",
  "SydhHelas3Elk112771IggrabUniPk151-ran.csv",
  "SydhHelas3Elk4UcdUniPk151-ran.csv",
  "SydhHelas3Gtf2f1ab28179IggrabUniPk151-ran.csv",
  "SydhHelas3Hae2f1UniPk151-ran.csv",
  "SydhHelas3Ini1IggmusUniPk151-ran.csv",
  "SydhHelas3Irf3IggrabUniPk151-ran.csv",
  "SydhHelas3JundIggrabUniPk151-ran.csv",
  "SydhHelas3MafkIggrabUniPk151-ran.csv",
  "SydhHelas3MaxIggrabUniPk151-ran.csv",
  "SydhHelas3Mazab85725IggrabUniPk151-ran.csv",
  "SydhHelas3Mxi1af4185IggrabUniPk151-ran.csv",
  "SydhHelas3NfyaIggrabUniPk151-ran.csv",
  "SydhHelas3NfybIggrabUniPk151-ran.csv",
  "SydhHelas3Nrf1IggmusUniPk151-ran.csv",
  "SydhHelas3P300sc584sc584IggrabUniPk151-ran.csv",
  "SydhHelas3Pol2UniPk151-ran.csv",
  "SydhHelas3Pol2s2IggrabUniPk151-ran.csv",
  "SydhHelas3Prdm19115IggrabUniPk151-ran.csv",
  "SydhHelas3Rad21IggrabUniPk151-ran.csv",
  "SydhHelas3Rfx5200401194IggrabUniPk151-ran.csv",
  "SydhHelas3Rpc155UniPk151-ran.csv",
  "SydhHelas3Smc3ab9263IggrabUniPk151-ran.csv",
  "SydhHelas3Spt20UniPk151-ran.csv",
  "SydhHelas3Stat1Ifng30UniPk151-ran.csv",
  "SydhHelas3Stat3IggrabUniPk151-ran.csv",
  "SydhHelas3TbpIggrabUniPk151-ran.csv",
  "SydhHelas3Tcf7l2UcdUniPk151-ran.csv",
  "SydhHelas3Tcf7l2c9b92565UcdUniPk151-ran.csv",
  "SydhHelas3Tf3c110UniPk151-ran.csv",
  "SydhHelas3Tr4UniPk151-ran.csv",
  "SydhHelas3Usf2IggmusUniPk151-ran.csv",
  "SydhHelas3Zkscan1hpa006672IggrabUniPk151-ran.csv",
  "SydhHelas3Znf143IggrabUniPk151-ran.csv",
  "SydhHelas3Znf274UcdUniPk151-ran.csv",
  "SydhHelas3Zzz3UniPk151-ran.csv",
  "SydhHepg2Arid3anb100279IggrabUniPk151-ran.csv",
  "SydhHepg2Bhlhe40cIggrabUniPk151-ran.csv",
  "SydhHepg2Brca1a300IggrabUniPk151-ran.csv",
  "SydhHepg2CebpbForsklnUniPk151-ran.csv",
  "SydhHepg2CebpbIggrabUniPk151-ran.csv",
  "SydhHepg2Chd2ab68301IggrabUniPk151-ran.csv",
  "SydhHepg2CjunIggrabUniPk151-ran.csv",
  "SydhHepg2Corestsc30189IggrabUniPk151-ran.csv",
  "SydhHepg2ErraForsklnUniPk151-ran.csv",
  "SydhHepg2Grp20ForsklnUniPk151-ran.csv",
  "SydhHepg2Hnf4aForsklnUniPk151-ran.csv",
  "SydhHepg2Hsf1ForsklnUniPk151-ran.csv",
  "SydhHepg2Irf3IggrabUniPk151-ran.csv",
  "SydhHepg2JundIggrabUniPk151-ran.csv",
  "SydhHepg2Maffm8194IggrabUniPk151-ran.csv",
  "SydhHepg2Mafkab50322IggrabUniPk151-ran.csv",
  "SydhHepg2Mafksc477IggrabUniPk151-ran.csv",
  "SydhHepg2MaxIggrabUniPk151-ran.csv",
  "SydhHepg2Mazab85725IggrabUniPk151-ran.csv",
  "SydhHepg2Mxi1UniPk151-ran.csv",
  "SydhHepg2Nrf1IggrabUniPk151-ran.csv",
  "SydhHepg2P300sc582IggrabUniPk151-ran.csv",
  "SydhHepg2Pgc1aForsklnUniPk151-ran.csv",
  "SydhHepg2Pol2ForsklnUniPk151-ran.csv",
  "SydhHepg2Pol2IggrabUniPk151-ran.csv",
  "SydhHepg2Pol2s2IggrabUniPk151-ran.csv",
  "SydhHepg2Rad21IggrabUniPk151-ran.csv",
  "SydhHepg2Rfx5200401194IggrabUniPk151-ran.csv",
  "SydhHepg2Smc3ab9263IggrabUniPk151-ran.csv",
  "SydhHepg2Srebp1InslnUniPk151-ran.csv",
  "SydhHepg2TbpIggrabUniPk151-ran.csv",
  "SydhHepg2Tcf7l2UcdUniPk151-ran.csv",
  "SydhHepg2Tr4UcdUniPk151-ran.csv",
  "SydhHepg2Usf2IggrabUniPk151-ran.csv",
  "SydhHepg2Znf274UcdUniPk151-ran.csv",
  "SydhHuvecCfosUcdUniPk151-ran.csv",
  "SydhHuvecCjunUniPk151-ran.csv",
  "SydhHuvecGata2UcdUniPk151-ran.csv",
  "SydhHuvecMaxUniPk151-ran.csv",
  "SydhHuvecPol2UniPk151-ran.csv",
  "SydhImr90CebpbIggrabUniPk151-ran.csv",
  "SydhImr90CtcfbIggrabUniPk151-ran.csv",
  "SydhImr90MafkIggrabUniPk151-ran.csv",
  "SydhImr90Pol2IggrabUniPk151-ran.csv",
  "SydhImr90Rad21IggrabUniPk151-ran.csv",
  "SydhK562Arid3asc8821IggrabUniPk151-ran.csv",
  "SydhK562Atf106325UniPk151-ran.csv",
  "SydhK562Atf3UniPk151-ran.csv",
  "SydhK562Bach1sc14700IggrabUniPk151-ran.csv",
  "SydhK562Bdp1UniPk151-ran.csv",
  "SydhK562Bhlhe40nb100IggrabUniPk151-ran.csv",
  "SydhK562Brf1UniPk151-ran.csv",
  "SydhK562Brf2UniPk151-ran.csv",
  "SydhK562Brg1IggmusUniPk151-ran.csv",
  "SydhK562Ccnt2UniPk151-ran.csv",
  "SydhK562CebpbIggrabUniPk151-ran.csv",
  "SydhK562CfosUniPk151-ran.csv",
  "SydhK562Chd2ab68301IggrabUniPk151-ran.csv",
  "SydhK562CjunIfna30UniPk151-ran.csv",
  "SydhK562CjunIfna6hUniPk151-ran.csv",
  "SydhK562CjunIfng30UniPk151-ran.csv",
  "SydhK562CjunIfng6hUniPk151-ran.csv",
  "SydhK562CjunUniPk151-ran.csv",
  "SydhK562CmycIfna30UniPk151-ran.csv",
  "SydhK562CmycIfna6hUniPk151-ran.csv",
  "SydhK562CmycIfng30UniPk151-ran.csv",
  "SydhK562CmycIfng6hUniPk151-ran.csv",
  "SydhK562CmycIggrabUniPk151-ran.csv",
  "SydhK562CmycUniPk151-ran.csv",
  "SydhK562Corestab24166IggrabUniPk151-ran.csv",
  "SydhK562Corestsc30189IggrabUniPk151-ran.csv",
  "SydhK562CtcfbIggrabUniPk151-ran.csv",
  "SydhK562E2f4UcdUniPk151-ran.csv",
  "SydhK562E2f6UcdUniPk151-ran.csv",
  "SydhK562Elk112771IggrabUniPk151-ran.csv",
  "SydhK562Gata1UcdUniPk151-ran.csv",
  "SydhK562Gata2UcdUniPk151-ran.csv",
  "SydhK562Gtf2bUniPk151-ran.csv",
  "SydhK562Gtf2f1ab28179IggrabUniPk151-ran.csv",
  "SydhK562Hmgn3UniPk151-ran.csv",
  "SydhK562Ini1IggmusUniPk151-ran.csv",
  "SydhK562Irf1Ifna30UniPk151-ran.csv",
  "SydhK562Irf1Ifna6hUniPk151-ran.csv",
  "SydhK562Irf1Ifng30UniPk151-ran.csv",
  "SydhK562Irf1Ifng6hUniPk151-ran.csv",
  "SydhK562JundIggrabUniPk151-ran.csv",
  "SydhK562Kap1UcdUniPk151-ran.csv",
  "SydhK562MaffIggrabUniPk151-ran.csv",
  "SydhK562Mafkab50322IggrabUniPk151-ran.csv",
  "SydhK562MaxIggrabUniPk151-ran.csv",
  "SydhK562Mazab85725IggrabUniPk151-ran.csv",
  "SydhK562Mxi1af4185IggrabUniPk151-ran.csv",
  "SydhK562NelfeUniPk151-ran.csv",
  "SydhK562Nfe2UniPk151-ran.csv",
  "SydhK562NfyaUniPk151-ran.csv",
  "SydhK562NfybUniPk151-ran.csv",
  "SydhK562Nrf1IggrabUniPk151-ran.csv",
  "SydhK562P300IggrabUniPk151-ran.csv",
  "SydhK562Pol2Ifna30UniPk151-ran.csv",
  "SydhK562Pol2Ifna6hUniPk151-ran.csv",
  "SydhK562Pol2Ifng30UniPk151-ran.csv",
  "SydhK562Pol2Ifng6hUniPk151-ran.csv",
  "SydhK562Pol2IggmusUniPk151-ran.csv",
  "SydhK562Pol2UniPk151-ran.csv",
  "SydhK562Pol2s2IggrabUniPk151-ran.csv",
  "SydhK562Pol2s2UniPk151-ran.csv",
  "SydhK562Pol3UniPk151-ran.csv",
  "SydhK562Rad21UniPk151-ran.csv",
  "SydhK562Rfx5IggrabUniPk151-ran.csv",
  "SydhK562Rpc155UniPk151-ran.csv",
  "SydhK562Setdb1MnasedUcdUniPk151-ran.csv",
  "SydhK562Setdb1UcdUniPk151-ran.csv",
  "SydhK562Sirt6UniPk151-ran.csv",
  "SydhK562Smc3ab9263IggrabUniPk151-ran.csv",
  "SydhK562Stat1Ifna30UniPk151-ran.csv",
  "SydhK562Stat1Ifna6hUniPk151-ran.csv",
  "SydhK562Stat1Ifng30UniPk151-ran.csv",
  "SydhK562Stat1Ifng6hUniPk151-ran.csv",
  "SydhK562Stat2Ifna30UniPk151-ran.csv",
  "SydhK562Stat2Ifna6hUniPk151-ran.csv",
  "SydhK562Tal1sc12984IggmusUniPk151-ran.csv",
  "SydhK562Tblr1ab24550IggrabUniPk151-ran.csv",
  "SydhK562Tblr1nb600270IggrabUniPk151-ran.csv",
  "SydhK562TbpIggmusUniPk151-ran.csv",
  "SydhK562Tf3c110UniPk151-ran.csv",
  "SydhK562Tr4UcdUniPk151-ran.csv",
  "SydhK562Ubfsc13125IggmusUniPk151-ran.csv",
  "SydhK562Ubtfsab1404509IggmusUniPk151-ran.csv",
  "SydhK562Usf2IggrabUniPk151-ran.csv",
  "SydhK562Yy1UcdUniPk151-ran.csv",
  "SydhK562Znf143IggrabUniPk151-ran.csv",
  "SydhK562Znf263UcdUniPk151-ran.csv",
  "SydhK562Znf274UcdUniPk151-ran.csv",
  "SydhK562Znf274m01UcdUniPk151-ran.csv",
  "SydhMcf10aesCfosEtoh01HvdUniPk151-ran.csv",
  "SydhMcf10aesCfosTam112hHvdUniPk151-ran.csv",
  "SydhMcf10aesCfosTam14hHvdUniPk151-ran.csv",
  "SydhMcf10aesCfosTamHvdUniPk151-ran.csv",
  "SydhMcf10aesCmycEtoh01HvdUniPk151-ran.csv",
  "SydhMcf10aesCmycTam14hHvdUniPk151-ran.csv",
  "SydhMcf10aesE2f4TamHvdUniPk151-ran.csv",
  "SydhMcf10aesPol2Etoh01UniPk151-ran.csv",
  "SydhMcf10aesPol2TamUniPk151-ran.csv",
  "SydhMcf10aesStat3Etoh01UniPk151-ran.csv",
  "SydhMcf10aesStat3Etoh01bUniPk151-ran.csv",
  "SydhMcf10aesStat3Etoh01cUniPk151-ran.csv",
  "SydhMcf10aesStat3Tam112hHvdUniPk151-ran.csv",
  "SydhMcf10aesStat3TamUniPk151-ran.csv",
  "SydhMcf7Gata3UcdUniPk151-ran.csv",
  "SydhMcf7Gata3sc269UcdUniPk151-ran.csv",
  "SydhMcf7Hae2f1UcdUniPk151-ran.csv",
  "SydhMcf7Tcf7l2UcdUniPk151-ran.csv",
  "SydhMcf7Znf217UcdUniPk151-ran.csv",
  "SydhNb4CmycUniPk151-ran.csv",
  "SydhNb4MaxUniPk151-ran.csv",
  "SydhNb4Pol2UniPk151-ran.csv",
  "SydhNt2d1Suz12UcdUniPk151-ran.csv",
  "SydhNt2d1Yy1UcdUniPk151-ran.csv",
  "SydhNt2d1Znf274UcdUniPk151-ran.csv",
  "SydhPanc1Tcf7l2UcdUniPk151-ran.csv",
  "SydhPbdeGata1UcdUniPk151-ran.csv",
  "SydhPbdePol2UcdUniPk151-ran.csv",
  "SydhPbdefetalGata1UcdUniPk151-ran.csv",
  "SydhRajiPol2UcdUniPk151-ran.csv",
  "SydhShsy5yGata2UcdUniPk151-ran.csv",
  "SydhShsy5yGata3sc269sc269UcdUniPk151-ran.csv",
  "SydhU2osKap1UcdUniPk151-ran.csv",
  "SydhU2osSetdb1UcdUniPk151-ran.csv",
  "UchicagoK562EfosUniPk151-ran.csv",
  "UchicagoK562Egata2UniPk151-ran.csv",
  "UchicagoK562Ehdac8UniPk151-ran.csv",
  "UchicagoK562EjunbUniPk151-ran.csv",
  "UchicagoK562EjundUniPk151-ran.csv",
  "UtaA549CtcfUniPk151-ran.csv",
  "UtaA549Pol2UniPk151-ran.csv",
  "UtaFibroblCtcfUniPk151-ran.csv",
  "UtaGlioblaCtcfUniPk151-ran.csv",
  "UtaGlioblaPol2UniPk151-ran.csv",
  "UtaGm12878CmycUniPk151-ran.csv",
  "UtaGm12878CtcfUniPk151-ran.csv",
  "UtaGm12878Pol2UniPk151-ran.csv",
  "UtaGm12891CtcfUniPk151-ran.csv",
  "UtaGm12892CtcfUniPk151-ran.csv",
  "UtaGm19238CtcfUniPk151-ran.csv",
  "UtaGm19239CtcfUniPk151-ran.csv",
  "UtaGm19240CtcfUniPk151-ran.csv",
  "UtaH1hescCmycUniPk151-ran.csv",
  "UtaH1hescCtcfUniPk151-ran.csv",
  "UtaH1hescPol2UniPk151-ran.csv",
  "UtaHelas3CmycUniPk151-ran.csv",
  "UtaHelas3CtcfUniPk151-ran.csv",
  "UtaHelas3Pol2UniPk151-ran.csv",
  "UtaHepg2CmycUniPk151-ran.csv",
  "UtaHepg2CtcfUniPk151-ran.csv",
  "UtaHepg2Pol2UniPk151-ran.csv",
  "UtaHuvecCmycUniPk151-ran.csv",
  "UtaHuvecCtcfUniPk151-ran.csv",
  "UtaHuvecPol2UniPk151-ran.csv",
  "UtaK562CmycUniPk151-ran.csv",
  "UtaK562CtcfUniPk151-ran.csv",
  "UtaK562Pol2UniPk151-ran.csv",
  "UtaMcf7CmycEstroUniPk151-ran.csv",
  "UtaMcf7CmycSerumstimUniPk151-ran.csv",
  "UtaMcf7CmycSerumstvdUniPk151-ran.csv",
  "UtaMcf7CmycVehUniPk151-ran.csv",
  "UtaMcf7CtcfEstroUniPk151-ran.csv",
  "UtaMcf7CtcfSerumstimUniPk151-ran.csv",
  "UtaMcf7CtcfSerumstvdUniPk151-ran.csv",
  "UtaMcf7CtcfUniPk151-ran.csv",
  "UtaMcf7CtcfVehUniPk151-ran.csv",
  "UtaMcf7Pol2SerumstimUniPk151-ran.csv",
  "UtaMcf7Pol2SerumstvdUniPk151-ran.csv",
  "UtaMcf7Pol2UniPk151-ran.csv",
  "UtaNhekCtcfUniPk151-ran.csv",
  "UtaProgfibCtcfUniPk151-ran.csv",
  "UtaProgfibPol2UniPk151-ran.csv",
  "UwA549CtcfUniPk151-ran.csv",
  "UwAg04449CtcfUniPk151-ran.csv",
  "UwAg04450CtcfUniPk151-ran.csv",
  "UwAg09309CtcfUniPk151-ran.csv",
  "UwAg09319CtcfUniPk151-ran.csv",
  "UwAg10803CtcfUniPk151-ran.csv",
  "UwAoafCtcfUniPk151-ran.csv",
  "UwBe2cCtcfUniPk151-ran.csv",
  "UwBjCtcfUniPk151-ran.csv",
  "UwCaco2CtcfUniPk151-ran.csv",
  "UwGm06990CtcfUniPk151-ran.csv",
  "UwGm12801CtcfUniPk151-ran.csv",
  "UwGm12864CtcfUniPk151-ran.csv",
  "UwGm12865CtcfUniPk151-ran.csv",
  "UwGm12872CtcfUniPk151-ran.csv",
  "UwGm12873CtcfUniPk151-ran.csv",
  "UwGm12874CtcfUniPk151-ran.csv",
  "UwGm12875CtcfUniPk151-ran.csv",
  "UwGm12878CtcfUniPk151-ran.csv",
  "UwHacCtcfUniPk151-ran.csv",
  "UwHaspCtcfUniPk151-ran.csv",
  "UwHbmecCtcfUniPk151-ran.csv",
  "UwHcfaaCtcfUniPk151-ran.csv",
  "UwHcmCtcfUniPk151-ran.csv",
  "UwHcpeCtcfUniPk151-ran.csv",
  "UwHct116CtcfUniPk151-ran.csv",
  "UwHeeCtcfUniPk151-ran.csv",
  "UwHek293CtcfUniPk151-ran.csv",
  "UwHelas3CtcfUniPk151-ran.csv",
  "UwHepg2CtcfUniPk151-ran.csv",
  "UwHffCtcfUniPk151-ran.csv",
  "UwHffmycCtcfUniPk151-ran.csv",
  "UwHl60CtcfUniPk151-ran.csv",
  "UwHmecCtcfUniPk151-ran.csv",
  "UwHmfCtcfUniPk151-ran.csv",
  "UwHpafCtcfUniPk151-ran.csv",
  "UwHpfCtcfUniPk151-ran.csv",
  "UwHreCtcfUniPk151-ran.csv",
  "UwHrpeCtcfUniPk151-ran.csv",
  "UwHuvecCtcfUniPk151-ran.csv",
  "UwHvmfCtcfUniPk151-ran.csv",
  "UwK562CtcfUniPk151-ran.csv",
  "UwMcf7CtcfUniPk151-ran.csv",
  "UwNb4CtcfUniPk151-ran.csv",
  "UwNhdfneoCtcfUniPk151-ran.csv",
  "UwNhekCtcfUniPk151-ran.csv",
  "UwNhlfCtcfUniPk151-ran.csv",
  "UwRptecCtcfUniPk151-ran.csv",
  "UwSaecCtcfUniPk151-ran.csv",
  "UwSknshraCtcfUniPk151-ran.csv",
  "UwWerirb1CtcfUniPk151-ran.csv",
  "UwWi38CtcfUniPk151-ran.csv"
  ]:

  # Progress marker: report which TFBS file/model this iteration handles.
  print("dataset:", fname)

  # Read the CSV for this entry from the tfbs dataset directory.
  # NOTE(review): tfbs_dataset is not referenced again in the visible part of
  # this loop body -- confirm the read is still needed.
  dsname = path_prefix + "DNABERT_2/Datasets/tfbs/" + fname
  tfbs_dataset = pd.read_csv(dsname, sep=',')

  # Load the pre-trained model and its tokenizer from the Output_Models
  # directory named after the file (file name up to its first ".").

  model_path = path_prefix + "DNABERT_2/Output_Models/" + fname.split(".")[0]

  # trust_remote_code=True permits custom model/tokenizer code bundled with the
  # checkpoint to run; only point model_path at checkpoints you trust.
  model_cls = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2, trust_remote_code=True)
  tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=30, padding_side="right", use_fast=True, trust_remote_code=True)

# reformat data to Hugging Face Dataset format from pandas

  def tokenize_ref(batch):
    """Tokenize the "ref_seq" entries of a batch with the current tokenizer."""
    sequences = batch["ref_seq"]
    encoded = tokenizer(
      sequences,
      return_tensors="pt",
      padding='longest',
      max_length=30,
      truncation=True,
    )
    return encoded
  def tokenize_alt(batch):
    """Tokenize the "alt_seq" entries of a batch with the current tokenizer."""
    sequences = batch["alt_seq"]
    encoded = tokenizer(
      sequences,
      return_tensors="pt",
      padding='longest',
      max_length=30,
      truncation=True,
    )
    return encoded


  # Tokenize the SNP sequence dataset twice, once per allele. With batched=True
  # and batch_size=None, map() hands the whole dataset to the function as a
  # single batch, so padding='longest' pads to the longest sequence overall.
  Dataset_snp_seq_tok_ref = Dataset_snp_seq.map(tokenize_ref, batched=True, batch_size=None)

  Dataset_snp_seq_tok_alt = Dataset_snp_seq.map(tokenize_alt, batched=True, batch_size=None)

  # Bundle both tokenized copies under named splits.
  dds = DatasetDict({
    'eval_ref': Dataset_snp_seq_tok_ref,
    'eval_alt': Dataset_snp_seq_tok_alt
  })

  # Switch to GPU. The device name is hard-coded; there is no CPU fallback here.

  model_cls = model_cls.to('cuda')

# load model

#  output_dir = path_prefix + 'outputs'

#  args = TrainingArguments(output_dir, learning_rate=lr, warmup_steps=warmup, fp16=True,
#    evaluation_strategy="steps", per_device_train_batch_size=train_bs, per_device_eval_batch_size=eval_bs,
#    eval_steps=eval_steps, save_steps=save_steps, logging_steps=100000, save_total_limit=save_total_limit,
#    load_best_model_at_end=True, metric_for_best_model="eval_matthews_correlation", num_train_epochs=epochs, weight_decay=0.01, report_to='none')

#  def compute_metrics(eval_preds):
#    metric = evaluate.combine(["accuracy","precision","recall","f1","matthews_correlation"])
#    logits, labels = eval_preds
#    if isinstance(logits, tuple):  # Unpack logits if it's a tuple
#      logits = logits[0]
#    predictions = np.argmax(logits, axis=-1)
#    return metric.compute(predictions=predictions, references=labels)

#  trainer = Trainer(model_cls, args, eval_dataset=dds['eval_ref'],
#                  tokenizer=tokenizer, compute_metrics=compute_metrics)

  # A Trainer built with default arguments; it is used here only for predict().
  trainer = Trainer(model_cls)

  # predictions from reference alleles
  eval_preds_ref = trainer.predict(dds['eval_ref'])

  # predictions from alternative alleles

  eval_preds_alt = trainer.predict(dds['eval_alt'])

  # find reference probabilities
  # NOTE(review): this import sits inside the loop body, so the statement is
  # re-executed on every iteration; it could live with the other imports.

  from scipy.special import softmax

  # predictions[0] is treated as the logits -- assumes .predictions is a tuple
  # whose first element is an (n_variants, 2) array; TODO confirm.
  # softmax over axis=1 normalises each row, and column 1 is kept
  # (presumably the positive class -- verify against the training labels).
  prob_predictions_ref = softmax(eval_preds_ref.predictions[0], axis=1)
  pos_prob_predictions_ref = prob_predictions_ref[:, 1]

  # find alternative probabilities (same steps as for the reference allele)

  prob_predictions_alt = softmax(eval_preds_alt.predictions[0], axis=1)
  pos_prob_predictions_alt = prob_predictions_alt[:, 1]

  # Store the column-1 probabilities as a new column in the ref and alt result
  # tables; the column is named after the file (name up to its first ".").

  prob_predictions_ref_dataset[fname.split(".")[0]] = pos_prob_predictions_ref

  prob_predictions_alt_dataset[fname.split(".")[0]] = pos_prob_predictions_alt

# Post-processing of the per-TF allele probabilities collected by the loop.
# The ref and alt probability tables are compared in several ways (absolute
# difference, absolute difference weighted by the larger allele probability,
# absolute log2-odds difference), each comparison is reduced to one score per
# variant by taking the maximum across TF columns, and the matrices and
# per-variant scores are written as CSV files under DNABERT_2/output/.

print(prob_predictions_ref_dataset)

print(prob_predictions_alt_dataset)

# Set the "TFBS dataset" column (the SNP names) aside so that the arithmetic
# below only touches the per-TF probability columns.

snp_names = prob_predictions_ref_dataset["TFBS dataset"]

prob_predictions_ref_dataset_nosnp = prob_predictions_ref_dataset.drop("TFBS dataset", axis=1)
prob_predictions_alt_dataset_nosnp = prob_predictions_alt_dataset.drop("TFBS dataset", axis=1)


def _log2_odds(prob_frame):
  """Return log2(p / (1 - p)) element-wise for a frame of probabilities.

  Entries that are exactly 0 or 1 come out as -inf / +inf.
  """
  return np.log2(prob_frame / (1 - prob_frame))


# Signed and absolute (ref - alt) probability differences per variant and TF

prob_predictions_diff_dataset_nosnp = prob_predictions_ref_dataset_nosnp - prob_predictions_alt_dataset_nosnp
prob_predictions_absdiff_dataset_nosnp = prob_predictions_diff_dataset_nosnp.abs()

# Find max absolute difference in probabilities across TFs (one value per variant)

max_absdiff_prob_predictions = prob_predictions_absdiff_dataset_nosnp.max(axis=1)
max_absdiff_prob_predictions_df = pd.DataFrame(columns=["snp", "max_absdiff_prob"])
max_absdiff_prob_predictions_df["snp"] = snp_names
max_absdiff_prob_predictions_df["max_absdiff_prob"] = max_absdiff_prob_predictions

# Write results to output files

output_file = path_prefix + "DNABERT_2/output/" + snp_test_name + "-results_by_variant.csv"
max_absdiff_prob_predictions_df.to_csv(output_file, index=False, sep=',')

matrix_file = path_prefix + "DNABERT_2/output/" + snp_test_name + "-variant_matrix.csv"
prob_predictions_absdiff_dataset_nosnp.to_csv(matrix_file, index=False, sep=',')

# log2 odds of the reference-allele probabilities only (the ref/alt log-odds
# difference is computed further down)

prob_predictions_logodds_dataset_nosnp = _log2_odds(prob_predictions_ref_dataset_nosnp)

log_matrix_file = path_prefix + "DNABERT_2/output/" + snp_test_name + "-variant_log_matrix.csv"
prob_predictions_logodds_dataset_nosnp.to_csv(log_matrix_file, index=False, sep=',')

# |ref - alt| weighted by max(p_ref, p_alt)

sub_mat = (prob_predictions_ref_dataset_nosnp - prob_predictions_alt_dataset_nosnp).abs()
# Element-wise max of the two frames: stack them and reduce the rows that share
# an index label. The former .max(level=0) spelling was removed in pandas 2.0;
# groupby(level=0).max() is its replacement, and sort=False keeps the rows in
# first-appearance order.
max_mat = (
  pd.concat([prob_predictions_ref_dataset_nosnp, prob_predictions_alt_dataset_nosnp])
  .groupby(level=0, sort=False)
  .max()
)
prob_predictions_absdiff_max_dataset_nosnp = sub_mat.mul(max_mat)

absdiff_max_matrix_file = path_prefix + "DNABERT_2/output/" + snp_test_name + "-variant_absdiff_max_matrix.csv"
prob_predictions_absdiff_max_dataset_nosnp.to_csv(absdiff_max_matrix_file, index=False, sep=',')

# Absolute difference between the log2 odds of the ref and alt alleles

prob_predictions_logodds_ref_dataset_nosnp = _log2_odds(prob_predictions_ref_dataset_nosnp)
prob_predictions_logodds_alt_dataset_nosnp = _log2_odds(prob_predictions_alt_dataset_nosnp)
prob_predictions_logodds_diff_dataset_nosnp = abs(prob_predictions_logodds_ref_dataset_nosnp - prob_predictions_logodds_alt_dataset_nosnp)

logodds_diff_matrix_file = path_prefix + "DNABERT_2/output/" + snp_test_name + "-variant_logodds_diff_matrix.csv"
prob_predictions_logodds_diff_dataset_nosnp.to_csv(logodds_diff_matrix_file, index=False, sep=',')

# Find max absolute difference*max(p_ref,p_alt) in probabilities across TFs

max_absdiff_max_prob_predictions = prob_predictions_absdiff_max_dataset_nosnp.max(axis=1)
max_absdiff_max_prob_predictions_df = pd.DataFrame(columns=["snp", "max_absdiff_max_prob"])
max_absdiff_max_prob_predictions_df["snp"] = snp_names
max_absdiff_max_prob_predictions_df["max_absdiff_max_prob"] = max_absdiff_max_prob_predictions

max_absdiff_max_output_file = path_prefix + "DNABERT_2/output/" + snp_test_name + "-max_absdiff_max_results_by_variant.csv"
max_absdiff_max_prob_predictions_df.to_csv(max_absdiff_max_output_file, index=False, sep=',')

# Find max abs diff of logodds in probabilities across TFs

max_absdiff_logodds_diff_prob_predictions = prob_predictions_logodds_diff_dataset_nosnp.max(axis=1)
max_absdiff_logodds_diff_prob_predictions_df = pd.DataFrame(columns=["snp", "max_absdiff_logodds_prob"])
max_absdiff_logodds_diff_prob_predictions_df["snp"] = snp_names
max_absdiff_logodds_diff_prob_predictions_df["max_absdiff_logodds_prob"] = max_absdiff_logodds_diff_prob_predictions

max_logodds_output_file = path_prefix + "DNABERT_2/output/" + snp_test_name + "-max_logodds_results_by_variant.csv"
max_absdiff_logodds_diff_prob_predictions_df.to_csv(max_logodds_output_file, index=False, sep=',')

# Weighted abs-diff columns and log-odds-diff columns side by side. Both halves
# carry the same TF column names, so every name appears twice in the result.

abs_diff_plus_log_odds_scores = pd.concat([prob_predictions_absdiff_max_dataset_nosnp, prob_predictions_logodds_diff_dataset_nosnp], axis=1)
abs_diff_plus_log_odds_scores_file = path_prefix + "DNABERT_2/output/" + snp_test_name + "-abs_diff_plus_log_odds_scores_results_by_variant.csv"
abs_diff_plus_log_odds_scores.to_csv(abs_diff_plus_log_odds_scores_file, index=False, sep=',')

# Column-wise z-score normalisation of the combined score table

abs_diff_plus_log_odds_scores_norm = abs_diff_plus_log_odds_scores.apply(lambda x: zscore(x))
abs_diff_plus_log_odds_scores_norm_file = path_prefix + "DNABERT_2/output/" + snp_test_name + "-abs_diff_plus_log_odds_scores_norm_results_by_variant.csv"
abs_diff_plus_log_odds_scores_norm.to_csv(abs_diff_plus_log_odds_scores_norm_file, index=False, sep=',')

Tests

In [None]:
# Spot check: reference-allele probabilities of one TF column, row positions 36-39.
prob_predictions_ref_dataset_nosnp['BroadGm12878CtcfUniPk151-ran'].iloc[36:40]

In [None]:
# Spot check: alternative-allele probabilities of one TF column, row positions 36-39.
prob_predictions_alt_dataset_nosnp['BroadGm12878CtcfUniPk151-ran'].iloc[36:40]

In [None]:
# Spot check: absolute ref/alt probability difference of one TF column, row positions 36-39.
sub_mat['BroadGm12878CtcfUniPk151-ran'].iloc[36:40]

In [None]:
# Spot check: larger of the ref/alt probabilities of one TF column, row positions 36-39.
max_mat['BroadGm12878CtcfUniPk151-ran'].iloc[36:40]

In [None]:
# Spot check: product of sub_mat and max_mat for one TF column, row positions 36-39.
prob_predictions_absdiff_max_dataset_nosnp['BroadGm12878CtcfUniPk151-ran'].iloc[36:40]

In [None]:
# Spot check: log2 odds of the reference-allele probabilities, one TF column, row positions 36-39.
prob_predictions_logodds_ref_dataset_nosnp['BroadGm12878CtcfUniPk151-ran'].iloc[36:40]

In [None]:
# Spot check: log2 odds of the alternative-allele probabilities, one TF column, row positions 36-39.
prob_predictions_logodds_alt_dataset_nosnp['BroadGm12878CtcfUniPk151-ran'].iloc[36:40]

In [None]:
# Spot check: absolute ref/alt log2-odds difference, one TF column, row positions 36-39.
prob_predictions_logodds_diff_dataset_nosnp['BroadGm12878CtcfUniPk151-ran'].iloc[36:40]

In [None]:
# Rebuild the combined table: weighted abs-diff columns followed by the
# log-odds-diff columns, joined side by side.
abs_diff_plus_log_odds_scores = pd.concat(
  [
    prob_predictions_absdiff_max_dataset_nosnp,
    prob_predictions_logodds_diff_dataset_nosnp,
  ],
  axis=1,
)

In [None]:
# Spot check: rows at positions 36-39 of the combined score table.
abs_diff_plus_log_odds_scores.iloc[36:40]

In [None]:
# Spot check: rows at positions 36-39 of the z-scored combined score table.
abs_diff_plus_log_odds_scores_norm.iloc[36:40]

In [None]:
# Summary statistics per column of the z-scored combined score table.
abs_diff_plus_log_odds_scores_norm.describe()