# Load and save motif data

----

In [1]:
import os, sys,time
from multiprocessing import Pool
from multiprocessing import sharedctypes
from numpy import ctypeslib
from functools import partial

import numpy as np
import pandas as pd

from scipy.stats.mstats import mquantiles

import pysam
import pyBigWig
from Bio import SeqIO

from pyDNAbinding.binding_model import DNASequence, PWMBindingModel, DNABindingModels, load_binding_models

In [2]:
#paths
ANNOT_DIR="/mnt/vdisk/data/synapse/encodeChallenge_Data/860/10214860/annotations/"
TRAIN_TSV_BASE_DIR = "/mnt/vdisk/data/synapse/encodeChallenge_Data/422/10035422/ChIPseq/labels/"
DNASE_IDR_PEAKS_BASE_DIR = "/mnt/vdisk/data/synapse/encodeChallenge_Data/251/10253251/essential_training_data/DNASE/peaks/conservative/"
DNASE_FOLD_COV_DIR = "/mnt/vdisk/data/synapse/encodeChallenge_Data/251/10253251/essential_training_data/DNASE/fold_coverage_wiggles/"
REF_GENOME="hg19.genome.fa"

#get all cell line names (imr90?)
ALL_CELL_LINES=[x.split('.')[1] for x in os.listdir(DNASE_FOLD_COV_DIR)]

#do not load all for testing
#MAX_N_ROWS=int(1e4)

In [3]:
idx=pd.read_csv(
    ANNOT_DIR+'test_regions.blacklistfiltered.bed.gz',
    index_col=(0,1,2),
    sep='\t').index

In [4]:
print len(idx)

60519746


### The DNASE raw fold coverage data

- parellization made it 3x faster
    - bigwigs cannot be serialized, so i just create and close them in every process which creates an overhead.
    - probably could be faster, but there are slower stuff so it doesnt matter that much
    
- if we want to load it for many cell lines it is slower

### Load the motif scores

- genome need to be read only once its like 2 minutes, i do it before once

- parallelization made ~10x faster, still long time
    - genome gets copied into every process but still faster than pysam which is disk reading based

In [5]:
start=time.time()
with open(REF_GENOME, "rU") as h:
    ref_genome=SeqIO.to_dict(SeqIO.parse(h, "fasta"))
print int(time.time()-start),'s'

47 s


In [6]:
#the aggregate colnames
aggregate_region_scores_labels = ["motif_mean", "motif_max", "motif_q99",
                                  "motif_q95", "motif_q90", "motif_q75", "motif_q50"]
def aggregate_region_scores(scores,
                            quantile_probs = [0.99, 0.95, 0.90, 0.75, 0.50]):
    """Return aggregate scores of all scores from a region."""
    rv = [scores.mean()/len(scores), scores.max()]
    rv.extend(mquantiles(scores, prob=quantile_probs))
    return rv

#global shared result array... is there a better way?
shape=(len(idx),len(aggregate_region_scores_labels))
motif_scores = sharedctypes.RawArray('d', shape[0]*shape[1])

def load_motif_scores(t_factor,
                      n_proc=12):
    """Load aggregate motif scores for a t_factor for intervals in the index."""
    #load binding models
    binding_models = load_binding_models("models.yaml")
    model = binding_models.get_from_tfname(t_factor)
    
    #partially apply the scorer function
    part_get_motif_scores_from_region=partial(
        get_motif_scores_from_region,model=model)
    
    #parrallel execute it
    Pool(n_proc).map(part_get_motif_scores_from_region,xrange(len(idx)))        
    
    #shared array to numpy array
    ms=ctypeslib.as_array(motif_scores).reshape(shape)
    
    colnames=[t_factor+'_'+x for x in aggregate_region_scores_labels]
    return pd.DataFrame(ms,index=idx,columns=colnames)
    
def get_motif_scores_from_region(i,model):
    """Get motif scores from a region in index."""
    contig, start, stop = idx[i]
    #load seq from genome
    seq=str(ref_genome[contig][start:stop].seq).upper()
    #pysam is a bit slower
    #genome = pysam.FastaFile(REF_GENOME)
    #seq=genome.fetch(contig, start, stop+1).upper()
    
    #make sharred array numpy array
    ms = ctypeslib.as_array(motif_scores).reshape(shape)
    
    #get aggregate motif scores
    ms[i] = aggregate_region_scores(
        DNASequence(seq).score_binding_sites(model[0], 'MAX'))
    
    return

In [None]:
#transcription factors for the final submissions
ALL_TF=['ATF2','CTCF','E2F1','EGR1','FOXA1','FOXA2','HNF4A','JUND','MAX','NANOG','REST','TAF1']

for tf in ALL_TF:
    print tf,
    motif_df=load_motif_scores(tf)
    motif_df.to_csv('motif_data/'+tf+'_motif.csv')

ATF2