## This notebook goes through dataset creation for binary models

In [1]:
import os
import pandas as pd
import numpy as np
import glob
import subprocess
import yaml, os, shutil, sys
import json
from natsort import natsorted
import tensorflow as tf

In [2]:
def set_genome(genome):
    '''Look up the reference file paths registered for a genome build.

    Parameters
    ----------
    genome : str
        Name of the genome build. Only 'hg38' is registered here.

    Returns
    -------
    dict
        File paths keyed by 'size' (chromosome sizes), 'fa' (fasta) and
        'unmap' (bed file of unmappable genome segments, optional).

    Raises
    ------
    AssertionError
        If `genome` is not a registered build.
    '''
    known_genomes = {
        'hg38': {
            'size': '/home/amber/ref/hg38/hg38.chrom.sizes',
            'fa': '/home/amber/ref/hg38/hg38.fa',
            'unmap': '/home/amber/ref/hg38/hg38_unmap.bed',
        },
    }

    # Reject unregistered builds before indexing into the table.
    assert genome in known_genomes, 'Unknown genome!'
    return known_genomes[genome]
    
def write_basset_samplefile(bed_filepaths, basset_samplefile):
    '''Write a tab-separated samplefile with one row per bed file.

    Each row holds an identifier derived from the bed file name followed by
    the bed file path. The file is written without a header row and without
    an index column.

    Parameters
    ----------
    bed_filepaths : iterable of str
        Paths of the bed files to list, in the order they should appear.
    basset_samplefile : str
        Path of the samplefile to write. Its parent directory is created
        if it does not exist yet.
    '''
    print('Generating merged samplefile for the entire bedfile set')
    # The identifier is everything in the file name before the first '.b',
    # e.g. 'K562_peaks.bed' and 'K562_peaks.bed.gz' both give 'K562_peaks'.
    # NOTE(review): a name containing an earlier '.b' (e.g. 'x.broad.bed')
    # is cut at that point as well; confirm this is acceptable for the data.
    records = [
        (os.path.basename(bedfile_path).split('.b')[0], bedfile_path)
        for bedfile_path in bed_filepaths
    ]
    # Build the frame in one go instead of enlarging it row by row.
    df = pd.DataFrame(records, columns=['identifier', 'file'])
    # to_csv does not create missing directories, so make sure the target exists.
    out_dir = os.path.dirname(basset_samplefile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(basset_samplefile, index=False, header=False, sep='\t')
    


The most relevant options here are:

| Argument | Note |
|:---|:---|
| chroms_only | if 'all', creates train, validation and test sets; if specific chromosomes are given, creates a test set from only those |
| input_size | input size of the genomic sequence |
| base_dir | the output directory for the tfr files |
| bigwig_paths_pattern | glob pattern that collects all the bigwig files |
| bigwig_filepaths | can be set instead, as an explicit list of all the bigwig files, if you don't want to use glob |
| bedfile_paths_pattern | same as bigwig_paths_pattern, but for the bed files of genomic regions to focus on, IF you want a peak-centered dataset |
| bed_filepaths | same as bigwig_filepaths, but for the bed files |
| pool_window | bin size; if set to 1, binning can be done later during training |
| dilation_rate | fraction of data to include; can be set to 0.1 to test pipelines |



In [3]:
# Size of each input; stored as config['input']['size'] and used in the output prefix.
input_size = 2048
# Directory that receives the samplefile and is passed on as the output dir.
base_dir = '../data/tfr_datasets/'
bedfile_paths_pattern = '../data/*/*peaks.bed'
# glob returns matches in arbitrary, filesystem-dependent order; sort them so the
# row order of the samplefile is reproducible between runs and machines.
bed_filepaths = sorted(
    f for f in glob.glob(bedfile_paths_pattern) if f.endswith(('bed', 'gz'))
)
# Chromosomes stored under config['chroms'] as 'valid' and 'test'.
valid_chr = 'chr9'
test_chr = 'chr8'
# Key understood by set_genome().
genome = 'hg38'
# Stored as config['input']['downsample'].
dilation_rate = 1

In [4]:
# Pre-requisite file for the pipeline: lists the bed paths, stored in the output directory.
basset_samplefile = os.path.join(base_dir, 'basset_samplefile.csv')
write_basset_samplefile(
    bed_filepaths=bed_filepaths,
    basset_samplefile=basset_samplefile,
)

Generating merged samplefile for the entire bedfile set


In [5]:
config_path = os.path.join(base_dir, 'config.yaml')

# Assemble the complete pipeline configuration as a single literal.
config = {
    # reference file paths for the chosen genome build
    'genomefile': set_genome(genome),
    # chromosomes assigned to the validation and test splits
    'chroms': {'valid': valid_chr, 'test': test_chr},
    'input': {'downsample': dilation_rate, 'size': input_size},
    'samplefile': {'basset': basset_samplefile},
    # the output prefix encodes the input size, e.g. 'i_2048_binary'
    'output': {'dir': base_dir, 'prefix': 'i_%i_binary' % input_size},
}

In [6]:
# Serialise the configuration in block style (default_flow_style=False).
# NOTE(review): the file goes to the current working directory, not to
# `config_path`; presumably bed_to_tfr.sh reads it from there. Confirm.
with open('config.yaml', 'w') as config_file:
    # yaml.dump returns None when given a stream, so nothing is kept from the call.
    yaml.dump(config, config_file, default_flow_style=False)

In [8]:
# Run the conversion shell script from the notebook's working directory.
# NOTE(review): presumably it reads the config.yaml written by the previous cell. Confirm.
! ./bed_to_tfr.sh

Peak centering
Generating bed region combined file for all TFs
Ignoring chrY +
LOADING DATA
LOADING DATA
LOADING DATA
