# Load and save dnase fold coverage data

----

In [3]:
import os, sys,time
from multiprocessing import Pool
from multiprocessing import sharedctypes
from numpy import ctypeslib
from functools import partial

import numpy as np
import pandas as pd

from scipy.stats.mstats import mquantiles

import pysam
import pyBigWig
from Bio import SeqIO

from pyDNAbinding.binding_model import DNASequence, PWMBindingModel, DNABindingModels, load_binding_models

In [9]:
# Paths to the challenge data (absolute paths on the analysis machine).
ANNOT_DIR="/mnt/vdisk/data/synapse/encodeChallenge_Data/860/10214860/annotations/"
TRAIN_TSV_BASE_DIR = "/mnt/vdisk/data/synapse/encodeChallenge_Data/422/10035422/ChIPseq/labels/"
DNASE_IDR_PEAKS_BASE_DIR = "/mnt/vdisk/data/synapse/encodeChallenge_Data/251/10253251/essential_training_data/DNASE/peaks/conservative/"
DNASE_FOLD_COV_DIR = "/mnt/vdisk/data/synapse/encodeChallenge_Data/251/10253251/essential_training_data/DNASE/fold_coverage_wiggles/"
REF_GENOME="hg19.genome.fa"

# Cell line names are taken from files named 'DNASE.<cell_line>.fc.signal.bigwig',
# the same pattern load_dnase_fold_cov uses to rebuild the file name. Entries
# that do not match the pattern are skipped instead of raising an IndexError or
# producing a cell line without a bigWig file.
# NOTE(review): the listing contains both 'IMR-90' and 'IMR90' -- these look
# like the same cell line under two spellings; confirm before merging results.
_DNASE_FC_PREFIX = 'DNASE.'
_DNASE_FC_SUFFIX = '.fc.signal.bigwig'
ALL_CELL_LINES=[x[len(_DNASE_FC_PREFIX):-len(_DNASE_FC_SUFFIX)]
                for x in os.listdir(DNASE_FOLD_COV_DIR)
                if x.startswith(_DNASE_FC_PREFIX)
                and x.endswith(_DNASE_FC_SUFFIX)
                and len(x) > len(_DNASE_FC_PREFIX) + len(_DNASE_FC_SUFFIX)]

# Only read the first MAX_N_ROWS regions while testing; do not load everything.
MAX_N_ROWS=int(1e5)

### Read the test regions

In [10]:
# Index of the test regions: the first three columns of the tab-separated
# BED file become the levels of a MultiIndex; only the first MAX_N_ROWS rows
# are read.
# NOTE(review): read_csv keeps its default header handling here, so the first
# line of the file is consumed as a header row -- confirm the BED file really
# has one, otherwise the first region is silently dropped.
idx = pd.read_csv(
    ANNOT_DIR + 'test_regions.blacklistfiltered.bed.gz',
    sep='\t',
    nrows=MAX_N_ROWS,
    index_col=(0, 1, 2),
).index

### The DNASE raw fold coverage data

- Parallelization made it 3x faster.
    - BigWig file handles cannot be serialized, so I just open and close them in every process, which creates an overhead.
    - It could probably be faster, but other steps are slower, so it doesn't matter that much.
    
- If we want to load it for many cell lines, it is slower.

In [15]:
# Global shared result array: one C double ('d') slot per row of idx.
# get_dnase_fold_cov_from_region stores the value of region i in slot i, and
# load_dnase_fold_cov reads the buffer back into a DataFrame. The same buffer
# is reused for every cell line, so its length is fixed to len(idx) at the
# moment this cell runs.
# NOTE(review): presumably the worker processes see this buffer because they
# are forked after it is created -- confirm. Is there a better way than a
# module-level global?
fold_cov = sharedctypes.RawArray('d', len(idx))

def load_dnase_fold_cov(cell_line,idx,
                        n_proc=12,
                        dnase_fold_cov_dir=DNASE_FOLD_COV_DIR ):
    """Load the raw DNase fold coverage of every region for one cell line.

    The regions are scored in parallel: each worker call writes the value of
    one region into the module-level shared array ``fold_cov``.

    Parameters
    ----------
    cell_line : str
        Cell line name; the bigWig read is
        ``dnase_fold_cov_dir + 'DNASE.' + cell_line + '.fc.signal.bigwig'``.
    idx : index of (contig, start, stop) entries
        Regions to score. It must not be longer than ``fold_cov``.
    n_proc : int
        Number of worker processes.
    dnase_fold_cov_dir : str
        Directory (with trailing slash) holding the fold coverage bigWigs.

    Returns
    -------
    pandas.DataFrame
        One column named ``cell_line + '_dnase_fc'``, indexed by ``idx``.

    Raises
    ------
    ValueError
        If ``idx`` has more rows than the shared result array can hold.
    """
    n_regions = len(idx)
    if n_regions > len(fold_cov):
        raise ValueError(
            'idx has %d regions but the shared fold_cov array only holds %d'
            % (n_regions, len(fold_cov)))

    dnase_fold_cov_fname = dnase_fold_cov_dir + 'DNASE.'+cell_line+'.fc.signal.bigwig'

    #partially apply the scorer function
    part_get_dnase_fold_cov_from_region=partial(get_dnase_fold_cov_from_region,
                                                idx=idx,fn=dnase_fold_cov_fname)
    #parallel execute it
    pool = Pool(n_proc)
    try:
        pool.map(part_get_dnase_fold_cov_from_region,xrange(n_regions))
    finally:
        # map() has either returned (all tasks done) or raised, so the workers
        # can be stopped right away; without this every call would leave
        # n_proc worker processes behind.
        pool.terminate()
        pool.join()

    # Copy the values out of the shared buffer: the buffer is reused by the
    # next call, which must not change a frame that was already returned.
    values = np.frombuffer(fold_cov, dtype=np.float64)[:n_regions].copy()

    #return a dataframe
    return pd.DataFrame({cell_line+'_dnase_fc' : values},index=idx)

def get_dnase_fold_cov_from_region(i,idx,fn=None):
    """Store the raw DNase fold coverage of region ``i`` in ``fold_cov[i]``.

    Parameters
    ----------
    i : int
        Position of the region in ``idx`` and of its slot in ``fold_cov``.
    idx : index of (contig, start, stop) entries
        The regions; ``idx[i]`` is unpacked into contig, start and stop.
    fn : str
        Path of the bigWig file to read.

    Returns
    -------
    None
        The result is written to the module-level shared array ``fold_cov``.
    """
    contig,start,stop=idx[i]
    # The file is opened in every call because the handle cannot be passed
    # between processes.
    bigwig_f=pyBigWig.open(fn)
    try:
        # First (only) element of the summary list returned by stats() with
        # its default settings.
        region_cov=bigwig_f.stats(contig,start,stop)[0]
    finally:
        # Close the handle even when stats() raises.
        bigwig_f.close()
    # stats() reports None for a region without data; None cannot be stored
    # in an array of doubles, so such regions are recorded as NaN.
    fold_cov[i]=float('nan') if region_cov is None else region_cov
    return
    
for cl in ALL_CELL_LINES:  
    print cl,
    dnase_fc_df=load_dnase_fold_cov(cl,idx)
    dnase_fc_df.to_csv('fold_cov_data/'+cl+'_dnase_fold_cov.csv')

Panc1 MCF-7 HepG2 A549 HCT116 GM12878 H1-hESC induced_pluripotent_stem_cell PC-3 IMR-90 SK-N-SH liver HeLa-S3 IMR90 K562
