In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import os
import pr_peaks

In [None]:
# Load the peak data: one `pr_peaks.Condition` object per ChIP-seq experiment,
# collected in the `conditions` list in the order high -> low.
# NOTE(review): the arguments look like (condition name, name of the peak list,
# a numeric value, experiment id) -- confirm against the definition of
# `pr_peaks.Condition`, which is not shown in this notebook.
# NOTE(review): `medium2` and `medium3` both use the '3HCP' peak list; verify
# that this is intended and not a typo.
high       = pr_peaks.Condition('high'   ,'all_treated',0.05,'gv_107_01_01_chipseq')
medium1    = pr_peaks.Condition('medium1','4HCP'       ,0.10,'gv_108_01_01_chipseq')
medium2    = pr_peaks.Condition('medium2','3HCP'       ,0.50,'gv_109_01_01_chipseq')
medium3    = pr_peaks.Condition('medium3','3HCP'       ,1.00,'gv_110_01_01_chipseq')
low        = pr_peaks.Condition('low'    ,'1HCP'       ,10.0,'gv_111_01_01_chipseq')
conditions = [high,medium1,medium2,medium3,low]

In [None]:
# prepare the list of chromosomes: chr1 ... chr22 plus chrX, stored as an
# immutable tuple
# `range` is used rather than `xrange` so that the cell gives the same result
# on both Python 2 and Python 3
chromosomes = tuple(['chr%d'%i for i in range(1,23)] + ['chrX'])

# 2018-03-06 Removing background
One piece of advice I got from the Lab Meeting presentation was to consider the values of the read counts in the ChIP-seq experiments **with removed background**, that is, subtracting the average number of reads per base pair in the genome. This is to make sure that at the data point corresponding to the lowest concentration - where there are fewer reads per base pair - the value of the ratio between the H and L peak read numbers is not artificially depleted. I therefore want to discretize the ChIP-seq experiments and perform the corresponding analysis.

## Zerone

The first thing I want to try is Zerone. This software is optimized for data with replicates and with a control. I used the Zerone discretizer to get the reads per window of the experiments, using `T0_roberto_input` as the control.

### Data parsing and access

Let's have a look at how to parse and access the data. The data files are very large because they correspond to the entire genome. I will use the numpy `genfromtxt` function to get the values into a convenient array. The ChIP-seq experiments also identify regions that belong to unknown contigs, so in the end the function will return only the values corresponding to known chromosomes.

In [None]:
# first, let's define the file names
# `os.path.expanduser('~')` is used instead of `os.getenv('HOME')`: the latter
# returns None when HOME is not set, which would silently produce paths that
# start with 'None/'
pr_peaks_root_dir = '%s/work/CRG/projects/pr_peaks'%(os.path.expanduser('~'))
data_dir = '%s/data'%(pr_peaks_root_dir)
# attach to every condition the path of its Zerone output file
for condition in conditions :
    condition.zerone_out = '%s/%s-zerone.out'%(data_dir,condition.name)

The next step is to define the functions that parse the Zerone output. In the end I want to be able to rapidly access the values in the array, so I need to provide an index. The `make_zerone_output_index` function does this: it returns a dictionary that has the chromosome names as keys and, as values, a tuple with the indices (both inclusive) of the first and of the last row of the array that belong to that chromosome.

In [None]:
def make_zerone_output_index(a,chromosome_list=None) :
    """
    Builds an index of the rows of the array `a` by chromosome.

    Parameters:
        a: array with a 'chr' field, in which the rows of each chromosome are
           contiguous. The array is not sorted here.
        chromosome_list: accepted for backward compatibility, but not used: the
           index is always built from the chromosome names found in `a`.

    Returns a dictionary that maps each chromosome name (as found in `a['chr']`)
    to a tuple `(first,last)` with the indices of the first and of the last row
    of that chromosome, both inclusive. If a chromosome name appears in more than
    one contiguous run, the last run wins. An empty array yields an empty
    dictionary.
    """
    n_rows = len(a)
    # nothing to index: this also covers arrays that lost their fields because
    # no row survived an upstream filter
    if n_rows == 0 :
        return {}
    chrom = a['chr']
    # rows at which the chromosome name differs from the one in the previous row
    breaks = np.flatnonzero(chrom[1:] != chrom[:-1]) + 1
    # a run starts at row 0 and at every break; it ends right before the next
    # break, and the last run ends at the last row of the array
    run_starts = np.concatenate(([0],breaks))
    run_ends = np.concatenate((breaks-1,[n_rows-1]))
    c_idx = {}
    for first,last in zip(run_starts,run_ends) :
        # convert to plain Python integers, so that the index does not depend
        # on numpy scalar types
        c_idx[chrom[first]] = (int(first),int(last))
    return c_idx

In [None]:
def parse_zerone_output(fname,chromosome_list=None) :
    """
    Parses a Zerone output and returns a numpy array. The values of the numpy array
    are: chromosome, start, end, enrichment, read_1, read_2, ..., read_n, p.
    The number of `read_i` columns depends on the invocation of Zerone and cannot
    be known beforehand.

    Parameters:
        fname: path of the Zerone output file.
        chromosome_list: optional collection of chromosome names; when given, only
           the rows of these chromosomes are kept.

    Returns the tuple `(a,c_idx)`, where `a` is the structured array and `c_idx`
    is the index built by `make_zerone_output_index`.

    Raises ValueError if the file contains no data line.
    """
    # first, we start by reading the first data line in the Zerone file (skipping
    # comments and blank lines), to determine the number of `read` columns
    first_data_line = None
    with open(fname,'r') as f :
        for line in f :
            if line.strip() and not line.startswith('#') :
                first_data_line = line
                break
    if first_data_line is None :
        raise ValueError('no data line found in Zerone output %s'%(fname))
    # five columns are fixed: chr, start, end, enrichment and p
    n_readcols = len(first_data_line.split())-5
    zerone_dtype = [('chr','S256'),('start',np.int64),('end',np.int64),('enrichment',np.int32)]
    for i in range(1,n_readcols+1) :
        zerone_dtype.append(('read_%d'%(i),np.int64))
    zerone_dtype.append(('p',float))
    # now we parse the file using the `genfromtxt` function from numpy;
    # `atleast_1d` is needed because a file with a single data line yields a
    # zero-dimensional array
    a = np.atleast_1d(np.genfromtxt(fname,dtype=np.dtype(zerone_dtype)))
    # next, we exclude the values of the array that pertain to chromosomes that are not
    # included in the chromosome list that was passed by the user (if any)
    if chromosome_list is not None :
        # cast the names to the dtype of the 'chr' field, so that the comparison
        # works regardless of whether the names are passed as str or bytes
        wanted = np.asarray(list(chromosome_list),dtype=a['chr'].dtype)
        # a boolean mask keeps the structured dtype even when no row matches
        a = a[np.isin(a['chr'],wanted)]
    # now pass the array to the index maker, and return the array along with the index
    c_idx = make_zerone_output_index(a,chromosome_list)
    return a,c_idx

I test the function that parses the Zerone output.

In [None]:
%%time
# parse the Zerone output of the `high` condition, keeping only the chromosomes
# listed in `chromosomes`
a,c_idx = parse_zerone_output(high.zerone_out,chromosome_list=chromosomes)

The execution time of this is reasonable.

I want now to write the pieces of code that will allow me to access the data quickly.

Since we have the discretized data along with the index, we can build a function that rapidly finds the index of the array corresponding to the values of the peaks that we want.

In [None]:
def findpeak(a,c_idx,peak,bin_size=300) :
    """
    Returns the values of the `a` array corresponding to the genomic coordinates
    of the `peak`. Uses the `c_idx` dictionary to rapidly calculate which are the indices
    of the `a` array that correspond to the peak.

    The calculation assumes that the rows of each chromosome in `a` are
    consecutive bins of `bin_size` base pairs, the first one starting at
    coordinate 0.

    Parameters:
        a: array of per-bin values.
        c_idx: dictionary mapping a chromosome name to the `(first,last)` row
           indices (both inclusive) of that chromosome in `a`.
        peak: object with the 'chr', 'start' and 'end' keys.
        bin_size: size of the bins, in base pairs.

    Returns a single element of `a` if the peak starts and ends in the same bin,
    otherwise the slice of `a` that spans all the bins touched by the peak. The
    slice never extends beyond the last bin of the chromosome of the peak.

    Raises IndexError if the peak starts beyond the last bin of its chromosome.
    """
    c_start,c_end = c_idx[peak['chr']]
    first = c_start + peak['start']//bin_size
    last = c_start + peak['end']//bin_size
    # without this check the rows of the next chromosome would be silently returned
    if first > c_end :
        raise IndexError('peak %s:%d-%d starts beyond the last bin of its chromosome'
                         %(peak['chr'],peak['start'],peak['end']))
    if first == last :
        return a[first]
    # clamp the end of the slice to the last bin of the chromosome
    return a[first:min(last,c_end)+1]

I now make a simple test to see whether the thing is working, with an example peak.

In [None]:
# take one example peak of the `high` condition, fetch the rows of the parsed
# Zerone array `a` that correspond to it, and print both for visual inspection
peak = high.peaks[740]
ca = findpeak(a,c_idx,peak)
print peak,ca

It works and it is very fast.

### Enrichment
The first question I want to ask is whether Xavi and Zerone agree on which regions of the genome are enriched.

First, I'll test that the peaks contained in the list of peaks (all_treated, 4HCP, etc) are also considered enriched by Zerone.

In [None]:
# I'll start with the `high` ones
# `a` and `c_idx` are the array and the index parsed above from the Zerone
# output of the `high` condition
for peak in high.peaks:
    p = findpeak(a,c_idx,peak)
    # print the peak only if none of its bins has a non-zero `enrichment` value
    if (p['enrichment']==0).all() :
        print peak

So here I identified one peak that Xavi identified as enriched and Zerone didn't.

Now I proceed with a systematic study. I reset the kernel here and load all the data again into a convenient data structure.

In [None]:
# parse the Zerone output of every condition, and attach the resulting array
# (`zerone`) and chromosome index (`c_idx`) to the condition object
# note: `a` and `c_idx` are overwritten at every iteration, so after the loop
# they hold the values of the last condition in `conditions`
for condition in conditions :
    # presumably logs which condition is being parsed -- `mbt.log_message` is
    # defined outside of this notebook
    mbt.log_message('parse_zerone_output','Parsing %s'%(condition.name))
    a,c_idx = parse_zerone_output(condition.zerone_out,chromosome_list=chromosomes)
    condition.zerone = a
    condition.c_idx = c_idx

Now that I have all the data loaded I can ask whether there are peaks in each condition that are considered non-enriched by Zerone and enriched by Xavi.

In [None]:
# for every condition, go through its list of peaks and look up the
# corresponding rows in the Zerone array of the same condition
for condition in conditions :
    for peak in condition.peaks:
        p = findpeak(condition.zerone,condition.c_idx,peak)
        # print the peak only if none of its bins has a non-zero `enrichment` value
        if (p['enrichment']==0).all() :
            print condition.name,peak