# This notebook goes through dataset creation for quantitative models. 


In [1]:
import sys
sys.path.append("../")
import os
import pandas as pd
import numpy as np
import glob
import subprocess
import yaml, os, shutil
import utils
import json
from natsort import natsorted
import tensorflow as tf

In [23]:
def set_genome(genome):
    '''Return the reference-file paths registered for a genome build.

    Parameters
    ----------
    genome : str
        Name of the build; only 'hg38' and 'mm9' are registered.

    Returns
    -------
    dict
        File paths keyed by 'size' (chromosome sizes), 'fa' (fasta) and
        'unmap' (bed file of unmappable / blacklisted segments).

    Raises
    ------
    AssertionError
        If `genome` is not one of the registered builds.
    '''
    hg38_files = {'size':  '../../genomes/GRCh38_EBV.chrom.sizes.tsv',
                  'fa':  '../../genomes/hg38.fa',
                  'unmap':  '../../genomes/GRCh38_unmap.bed'}
    mm9_files = {'size':  '../../genomes/mm9.chrom.sizes',
                 'fa':  '../../genomes/mm9.fa',
                 'unmap':  '../../genomes/mm9-blacklist.bed'}
    registered = {'hg38': hg38_files, 'mm9': mm9_files}

    # Reject unknown builds before the lookup so the caller sees a clear message
    assert genome in registered, 'Unknown genome!'
    return registered[genome]

def write_basenji_samplefile(bigwig_filepaths, basenji_samplefile):
    '''Write the tab-separated basenji sample file, one row per bigwig.

    The file has a header row and the columns index, identifier, file,
    sum_stat and description. `index` is the position of the path in
    `bigwig_filepaths`, `identifier` is the basename of the path cut at the
    first '.b' (so 'x.bw' and 'x.bigwig' both give 'x'), `sum_stat` is always
    'mean' and `description` is left empty.

    Parameters
    ----------
    bigwig_filepaths : iterable of str
        Paths of the bigwig files, in the order their indices should be assigned.
    basenji_samplefile : str
        Path of the sample file to write (overwritten if it exists).
    '''
    columns = ['index', 'identifier', 'file', 'sum_stat', 'description']
    # Collect every row first and build the frame once; enlarging a DataFrame
    # with df.loc[b] = ... copies the data on each insertion
    rows = [[b, os.path.basename(bigwig_filepath).split('.b')[0], bigwig_filepath, 'mean', '']
            for b, bigwig_filepath in enumerate(bigwig_filepaths)]
    df = pd.DataFrame(rows, columns=columns)

    # write to csv file (tab separated, no pandas row index)
    df.to_csv(basenji_samplefile, index=False, sep='\t')
    
def write_basset_samplefile(bed_filepaths, basset_samplefile):
    '''Write the tab-separated basset sample file, one row per bed file.

    The file has no header; each row holds an identifier and the file path.
    The identifier is the basename of the path cut at the first '.b' (so
    'x.bed' and 'x.bed.gz' both give 'x').

    Parameters
    ----------
    bed_filepaths : iterable of str
        Paths of the bed files to list.
    basset_samplefile : str
        Path of the sample file to write (overwritten if it exists).
    '''
    print('Generating merged samplefile for the entire bedfile set')
    # Collect every row first and build the frame once; enlarging a DataFrame
    # with df.loc[b] = ... copies the data on each insertion
    rows = [[os.path.basename(bedfile_path).split('.b')[0], bedfile_path]
            for bedfile_path in bed_filepaths]
    df = pd.DataFrame(rows, columns=['identifier', 'file'])
    # write to csv file (tab separated, no header row, no pandas row index)
    df.to_csv(basset_samplefile, index=False, header=False, sep='\t')
    


In [24]:
# Metadata table of the ATAC experiment files from ENCODE (tab-separated)
atac_file = pd.read_csv(filepath_or_buffer='../data/sample_metafile.tsv', sep='\t')

In [25]:
# Keep only files of biological replicate 1. The column is cast to str first so the
# comparison also matches when pandas parsed the column as integers.
filter_df = atac_file[atac_file['Biological replicate(s)'].astype(str) == "1"]
# Of those, keep the fold-change coverage tracks and the IDR thresholded peak calls
file_type_filter = filter_df['Output type'].isin(['fold change over control', 'IDR thresholded peaks'])
file_df = filter_df[file_type_filter]

# Suffix of the local file name for each output type that is downloaded
suffix_by_output_type = {'fold change over control': '_replicate_1.bw',
                         'IDR thresholded peaks': '_peaks.bed.gz'}

for idx, entry in file_df.iterrows():
    link = entry['File download URL']
    cell_line = entry['Biosample term name']
    cell_dir = utils.make_directory('../data/'+cell_line)

    out_path = os.path.join(cell_dir, cell_line + suffix_by_output_type[entry['Output type']])
    # Pass an argument list and skip the shell, so biosample names containing spaces
    # or shell metacharacters cannot break (or alter) the command
    cmd = ['wget', '-O', out_path, link]
    print(' '.join(cmd))
    status = subprocess.call(cmd)
    if status != 0:
        # best effort: report the failure and carry on with the remaining files
        print('Download failed (exit code %d): %s' % (status, link))

# Decompress the downloaded peak files once, after all downloads have finished
gz_paths = glob.glob('../data/*/*.bed.gz')
if gz_paths:
    subprocess.call(['gunzip'] + gz_paths)

Directory already exists!
wget -O ../data/Panc1/Panc1_replicate_1.bw https://www.encodeproject.org/files/ENCFF185TQC/@@download/ENCFF185TQC.bigWig
Directory already exists!
wget -O ../data/Panc1/Panc1_peaks.bed.gz https://www.encodeproject.org/files/ENCFF953NZY/@@download/ENCFF953NZY.bed.gz
Directory already exists!
wget -O ../data/PC-3/PC-3_replicate_1.bw https://www.encodeproject.org/files/ENCFF064FOF/@@download/ENCFF064FOF.bigWig
Directory already exists!
wget -O ../data/PC-3/PC-3_peaks.bed.gz https://www.encodeproject.org/files/ENCFF061YKV/@@download/ENCFF061YKV.bed.gz



The most relevant options here are:

| Argument | Note |
|:---|:---|
| chroms_only | if 'all' creates train, val, test, if specific chromosomes then creates test set from only those|
| input_size | length of each input genomic sequence |
| base_dir | the output directory for the tfr files |
| bigwig_paths_pattern | glob pattern (shell-style wildcards, not a regular expression) that will collect all the bigwig files |
| bigwig_filepaths | can set this instead as a list of all the bws if don't want to use glob |
| bedfile_paths_pattern | same for bed files of genomic regions to focus on IF you want peak centered dataset |
| bed_filepaths | same as bigwig_filepaths, but a list of the bed files |
| pool_window | bin size, if set to 1 can bin later in the training |
| dilation_rate | fraction of data to include, can set to 0.1 to test pipelines|



In [32]:
input_size = 3072
base_dir = '../data/tfr_datasets'
paths_pattern = '../data/*/*'
# glob returns paths in a filesystem-dependent order; sort them once so that the index
# each bigwig gets in the basenji sample file is the same on every machine and run
data_paths = sorted(glob.glob(paths_pattern))
bigwig_filepaths = [f for f in data_paths if f.endswith(('bigwig', 'bw'))]
bed_filepaths = [f for f in data_paths if f.endswith(('bed', 'gz'))]

# optional arguments
pool_window = 1  # bin size; with 1 the binning can be done later in the training
dilation_rate = 0.1  # fraction of data to include; 0.1 is useful to test the pipeline
valid_chr = 'chr9'
test_chr = 'chr8'  #*********** <------ ********* change this to comma separated list of chr you want to include
genome = 'hg38'  # key passed to set_genome
chroms_only = 'all'
norm = 'none'
threshold = 0 # threshold for train and val
test_threshold = 0 # test threshold
# only change these if you want overlapping inputs
step = 0
stride_test = 1 #e.g. set to 0.333333333333 to move by 1K if 3K input
padding = 'none'
utils.make_directory(base_dir) # create dir if not there already
# 'random' skips the basset sample file; to list the bed files instead, use
# basset_samplefile = os.path.join(base_dir, 'basset_samplefile.csv')
basset_samplefile = 'random'
basenji_samplefile = os.path.join(base_dir, 'basenji_samplefile.csv')


Making directory: ../data/tfr_datasets


In [33]:
# The basenji sample file (bigwig paths) is a prerequisite of the pipeline and is always written
write_basenji_samplefile(bigwig_filepaths=bigwig_filepaths, basenji_samplefile=basenji_samplefile)

# The basset sample file (bed paths) is written only when a real path was configured
# instead of the 'random' placeholder
if not (basset_samplefile == 'random'):
    write_basset_samplefile(bed_filepaths=bed_filepaths, basset_samplefile=basset_samplefile)

In [34]:
config_path = os.path.join(base_dir, 'config.yaml')

# Gather every pipeline setting chosen above into one nested dict
config = {
    # fasta / chromosome sizes / unmappable regions of the chosen genome build
    'genomefile': set_genome(genome),
    'chroms': {'valid': valid_chr, 'test': test_chr, 'only': chroms_only},
    'input': {'downsample': dilation_rate, 'size': input_size, 'pool': pool_window,
              'norm': norm, 'step': step, 'padding': padding},
    'samplefile': {'basset': basset_samplefile, 'basenji': basenji_samplefile},
    'threshold': threshold,
    'test_threshold': test_threshold,
    # the prefix encodes input size and pool window, e.g. i_3072_w_1
    'output': {'dir': base_dir,
               'prefix': 'i_%i_w_%i' % (input_size, pool_window)},
    'stride_test': stride_test,
}

In [35]:
# Serialise the settings to config.yaml in the current working directory (block style, not inline flow style)
with open('config.yaml', mode='w') as file:
    documents = yaml.dump(config, stream=file, default_flow_style=False)

In [36]:
# Run the bigwig -> TFRecord conversion script from the current directory.
# NOTE(review): presumably the script reads the config.yaml written to the working directory — confirm in bw_to_tfr.sh
! ./bw_to_tfr.sh

random
Chopping randomly
['../../genomes/hg38.fa', '../data/tfr_datasets/basenji_samplefile.csv']
Using test set threshold of 0.0
stride_train 1 converted to 3072.000000
stride_test 1 converted to 3072.000000
CHR
Contigs divided into
 Train:   545 contigs, 2529907462 nt (0.9007)
 Valid:    29 contigs,  135962071 nt (0.0484)
 Test:     34 contigs,  143066987 nt (0.0509)
<generator object divide_contigs_chr.<locals>.<genexpr> at 0x7fd060d39318>
./basenji_data_read.py --crop 0 -w 1 -u mean -s 1.000000 --norm none --padding none ../data/PC-3/PC-3_replicate_1.bw ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov/0.h5
./basenji_data_read.py --crop 0 -w 1 -u mean -s 1.000000 --norm none --padding none ../data/Panc1/Panc1_replicate_1.bw ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov/1.h5
./basenji_data_write.py -s 0 -e 256 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/

*
*
*
*
*
245
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
236
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
247
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
242
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
247
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
240
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
./basenji_data_write.py -s 4864 -e 5120 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-19.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 5120 -e 5376 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/seque

*
*
*
*
*
245
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
245
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
241
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
244
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
243
*
*
*
*
*
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
  values = values.flatten().tostring()
*
*
*
*
*
242
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
237
*
*
*
*
*
  values = values.flatten().tostring()
./basenji_data_write.py -s 9728 -e 9984 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-38.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.0

*
*
*
*
*
241
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
232
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
245
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
245
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
250
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
247
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
245
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
*
*
*
*
*
244
*
*
*
*
*
  values = values.flatten().tostring()
  values = values.flatten().tostring()
*
*
*
*
*
245
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
240
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
243
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
244
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
241
*
*
*
*
*
  values = values.flatten().tos

*
*
*
*
*
244
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
241
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
243
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
244
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
247
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
244
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
245
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
248
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
243
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
245
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
241
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
242
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
245
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
245
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
238
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
243
*
*
*
*
*
  values = values.flatten().tos

*
*
*
*
*
244
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
243
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
244
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
239
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
240
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
239
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
249
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
248
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
244
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
241
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
242
*
*
*
*
*
  values = values.flatten().tos

./basenji_data_write.py -s 28672 -e 28928 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-112.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 28928 -e 29184 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-113.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
*
*
*
*
*
244
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
246
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
247
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
249
*
*
*
*
*
*
*
*
*
*
244
*
*
*
*
*
*
*
*
*
*
244
*
*
*
*
*
  values = values.flatten().tostring()
  values = values.flatten().tostring()
  values = values.flatten().t

./basenji_data_write.py -s 33024 -e 33280 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-129.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 33280 -e 33536 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-130.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 33536 -e 33792 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-131.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 33792 -e 34048 --umap_cli

./basenji_data_write.py -s 37376 -e 37632 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-146.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 37632 -e 37888 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-147.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 37888 -e 38144 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-148.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 38144 -e 38400 --umap_cli

./basenji_data_write.py -s 41984 -e 42240 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-164.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 42240 -e 42496 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-165.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 42496 -e 42752 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-166.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 42752 -e 43008 --umap_cli

./basenji_data_write.py -s 46592 -e 46848 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-182.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 46848 -e 47104 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-183.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 47104 -e 47360 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-184.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 47360 -e 47616 --umap_cli

./basenji_data_write.py -s 51712 -e 51968 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-202.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 51968 -e 52224 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-203.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 52224 -e 52480 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-204.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 52480 -e 52736 --umap_cli

./basenji_data_write.py -s 56320 -e 56576 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-220.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 56576 -e 56832 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-221.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 56832 -e 57088 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-222.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 57088 -e 57344 --umap_cli

./basenji_data_write.py -s 60672 -e 60928 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-237.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 60928 -e 61184 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-238.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 61184 -e 61440 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-239.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 61440 -e 61696 --umap_cli

./basenji_data_write.py -s 65024 -e 65280 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-254.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 65280 -e 65536 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-255.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 65536 -e 65792 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-256.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 65792 -e 66048 --umap_cli

./basenji_data_write.py -s 69632 -e 69888 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-272.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 69888 -e 70144 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-273.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 70144 -e 70400 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-274.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 70400 -e 70656 --umap_cli

./basenji_data_write.py -s 74496 -e 74752 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-291.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 74752 -e 75008 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-292.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 75008 -e 75264 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-293.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 75264 -e 75520 --umap_cli

./basenji_data_write.py -s 79104 -e 79360 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-309.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 79360 -e 79616 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-310.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 79616 -e 79872 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/train-311.tfr train -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 79872 -e 80128 --umap_cli

*
*
*
*
*
207
*
*
*
*
*
  values = values.flatten().tostring()
./basenji_data_write.py -s 83606 -e 83862 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/valid-5.tfr valid -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 83862 -e 84118 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/valid-6.tfr valid -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 84118 -e 84374 --umap_clip 1.000000 ../../genomes/hg38.fa ../data/tfr_datasets/i_3072_w_1/sequences.bed ../data/tfr_datasets/i_3072_w_1/seqs_cov ../data/tfr_datasets/i_3072_w_1/tfrecords/valid-7.tfr valid -o ../data/tfr_datasets/i_3072_w_1 --threshold 0.000000 --test_threshold 0.00

In [44]:
# Show the first lines of the basenji sample file (one row per bigwig track)
! head ../data/tfr_datasets/basenji_samplefile.csv

index	identifier	file	sum_stat	description
0	PC-3_replicate_1	../data/PC-3/PC-3_replicate_1.bw	mean	
1	Panc1_replicate_1	../data/Panc1/Panc1_replicate_1.bw	mean	


In [45]:
# Sanity check for the dataset splits: count how often each value of column 4 of sequences.bed occurs
! cut -f4 ../data/tfr_datasets/i_3072_w_1/sequences.bed | sort | uniq -c

   4655 test
  82326 train
   4424 valid


In [46]:
# Print the config.yaml stored in the new dataset's directory as a summary of the settings used
! cat ../data/tfr_datasets/i_3072_w_1/config.yaml

chroms:
  only: all
  test: chr8
  valid: chr9
genomefile:
  fa: ../../genomes/hg38.fa
  size: ../../genomes/GRCh38_EBV.chrom.sizes.tsv
  unmap: ../../genomes/GRCh38_unmap.bed
input:
  downsample: 0.1
  norm: none
  padding: none
  pool: 1
  size: 3072
  step: 0
output:
  dir: ../data/tfr_datasets
  prefix: i_3072_w_1
samplefile:
  basenji: ../data/tfr_datasets/basenji_samplefile.csv
  basset: random
stride_test: 1
test_threshold: 0
threshold: 0


# From TFRecords to bigwigs: writing ground truth and predictions as bw files

In [None]:
# Placeholder: replace with the identifier of the trained model to evaluate
best_model = 'BEST_MODEL_EVER'
# NOTE(review): tfr_evaluate is not imported in the import cell at the top of this notebook — import it before running.
# bin_size is used further down as the span/step of the predicted bigwig entries.
model, bin_size = tfr_evaluate.read_model(best_model)

In [None]:
# Load the test set together with the list of target names ('ORDERED_TESTSET' is a placeholder directory).
# NOTE(review): coords=True presumably makes the records carry their genomic coordinates — confirm in tfr_evaluate
testset, targets = tfr_evaluate.collect_whole_testset(data_dir='ORDERED_TESTSET', 
                                                      coords=True)

In [None]:
# Unpack the test set into C (coordinates, parsed below as chrom_start_end), X (model inputs) and Y (ground truth)
# NOTE(review): the import cell brings in `utils`, not `util` — confirm which module provides convert_tfr_to_np;
# the meaning of the second argument (3) is not visible from this notebook
C, X, Y = util.convert_tfr_to_np(testset, 3)

In [None]:
# Run the model on all test inputs; preds is indexed below as preds[i, :, t] (data point, position, target)
# NOTE(review): `embed` is not imported in the import cell at the top of this notebook — import it before running
preds = embed.predict_np(X, model, batch_size=32, reshape_to_2D=False)

In [None]:
# Index of the single track to export: selects the name in `targets` and the last axis of Y / preds
targets_i = 0 #which target or cell line to write

In [None]:
# Create the output directory ('OUTPUT_DIR' is a placeholder) and name the prediction and
# ground-truth bigwig files after the selected target
out_dir = util.make_dir('OUTPUT_DIR')
target_name = targets[targets_i]
pred_bw_filename = os.path.join(out_dir, 'pred_{}.bw'.format(target_name))
truth_bw_filename = os.path.join(out_dir, 'truth_{}.bw'.format(target_name))

In [None]:
# Open both bigwig files (prediction first, then ground truth); 'GENOME_SIZE_PATH' is a
# placeholder for the chromosome sizes file handed to open_bw
chrom_size_path='GENOME_SIZE_PATH'
pred_bw, truth_bw = [test_to_bw_fast.open_bw(bw_filename, chrom_size_path)
                     for bw_filename in (pred_bw_filename, truth_bw_filename)]

In [None]:
# Turn each coordinate into a plain 'chrom_start_end' string. Bytes are decoded properly:
# stripping the characters of "b'" from str(c) would also eat a leading or trailing 'b'
# that belongs to the coordinate itself, because str.strip works on a character set.
clean_C = [c.decode('utf-8') if isinstance(c, bytes) else str(c) for c in C]

In [None]:

for i in range(preds.shape[0]): # per data point
    # Coordinates look like chrom_start_end. Split from the right so that contig names
    # which themselves contain underscores (e.g. chr1_KI270706v1_random) stay intact.
    chrom, start, end = clean_C[i].rsplit('_', 2) # get chr, start, end
    start = int(start) # to feed into bw making function
    for t in [targets_i]: # per selected target / cell line
        # write to ground truth file, one value per base
        truth_bw.addEntries(chrom, start,
            values=np.array(np.squeeze(Y[i,:,t]), dtype='float64'),
            span=1, step=1)
        # write to prediction bw file, one value per bin of bin_size bases
        pred_bw.addEntries(chrom, start,
            values=np.array(np.squeeze(preds[i,:,t]), dtype='float64'),
            span=bin_size, step=bin_size)

In [None]:
# Close both bigwig handles, ground truth first and then prediction
for bw_handle in (truth_bw, pred_bw):
    bw_handle.close()