In [None]:
# render matplotlib figures inline, in the output of the cell that draws them
%matplotlib inline
# load the autoreload extension and reload imported modules before each cell
# is executed, so that edits to local modules are picked up without a restart
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import pyBigWig
import os

# 2018-02-16 Height of peaks
In my previous analysis of the "chair model", I concluded that, certainly for the case of uniform transition probability between the chairs, there is a clear effect: the ratio between the occupancy of the comfortable chairs and the low-comfort chairs decreases with increasing concentration (number) of people claiming the chairs. Now, can this effect be recovered by looking at the peaks of the ChIP-seq experiments at different concentrations?

I'll check whether this is the case by taking the data produced by Roser and Guille and looking directly at the _number of reads corresponding to each region_. The hope is that, despite the total number of reads being different, there will be a clear effect due to this "volume exclusion".

## Preliminaries: code base

Here I copy/paste some of the code that I used before to analyze the peaks of the PR titration experiments.

In [None]:
def load_peaks (peaks_id,xavi_datadir='/mnt/xavi') :
    """
    Read the BED file that lists the peaks of one peak population.

    Parameters
    ----------
    peaks_id : identifier of the peak population; it is inserted into the
        file name "genomic_coordinates_by_peak_population_<peaks_id>.bed".
    xavi_datadir : root directory under which the project tables live.

    Returns
    -------
    Whatever `mbt.parse_simple_bed` returns for that file.
    """
    # directory of the titration project that holds the peak tables
    tables_dir = '%s/projects/gvicent/2017-01-23_characterisation_prbs_r5020_titration/tables'%(xavi_datadir)
    # full path of the BED file of the requested population
    bed_path = '%s/genomic_coordinates_by_peak_population_%s.bed'%(tables_dir,peaks_id)
    peaks = mbt.parse_simple_bed(bed_path)
    return peaks

In [None]:
def bw_location (sample_id,xavi_datadir='/mnt/xavi/data') :
    """
    Locate the BigWig (".bw") file of a ChIP-seq sample.

    The directory "<xavi_datadir>/chipseq/samples/<sample_id>/peaks" is
    searched recursively for files ending in ".bw". The first file whose
    path contains "with_control" is preferred; if there is none, the last
    ".bw" file found is used.

    Parameters
    ----------
    sample_id : identifier of the sample (name of its directory).
    xavi_datadir : root directory of the data tree.

    Returns
    -------
    The path of the selected file, or None (after issuing a warning) when
    no ".bw" file exists for the sample.
    """
    # local import: `warnings` is not part of the notebook's import cell
    import warnings
    # build the directory name where the files are
    peaks_dir = "%s/chipseq/samples/%s/peaks"%(xavi_datadir,sample_id)
    # select all files that end with ".bw" in the directory tree
    peakfiles = []
    for root,sub,files in os.walk(peaks_dir) :
        for f in files :
            if f.endswith(".bw") :
                peakfiles.append('%s/%s'%(root,f))
    # prefer the file that lives in a path containing "with_control";
    # otherwise keep the last file that was found
    fin = None
    for peakfile in peakfiles :
        fin = peakfile
        if 'with_control' in peakfile :
            break
    if fin is None :
        warnings.warn('bw_location: Data not found for %s'%sample_id)
    return fin

In [None]:
class Condition(object) :
    """
    One experimental condition: its peak list and its BigWig signal track.

    Attributes
    ----------
    name : label of the condition.
    peak_code : identifier passed to `load_peaks` to read the peak list.
    concentration : concentration associated with the condition.
    sample_id : identifier passed to `bw_location` to find the BigWig file.
    peaks : the peaks returned by `load_peaks(peak_code)`.
    bw_file : path returned by `bw_location(sample_id)`.
    bw : handle returned by `pyBigWig.open(bw_file)`; None until the file
        has been opened and after it has been closed.
    """
    def __init__(self,name,peak_code,concentration,sample_id) :
        # define the handle first, so that __del__ stays safe even when one
        # of the loading steps below raises and the object is left half-built
        self.bw = None
        self.name = name
        self.peak_code = peak_code
        self.concentration = concentration
        self.sample_id = sample_id
        # load the peaks
        self.peaks = load_peaks(self.peak_code)
        # init the BigWig file
        # NOTE(review): bw_location returns None when no file is found, and
        # that None is passed on to pyBigWig.open unchanged
        self.bw_file = bw_location(sample_id)
        # init the BigWig parser
        self.bw = pyBigWig.open(self.bw_file)
    def peak_counts(self,peak) :
        """
        Return the BigWig statistics of one peak.

        `peak` must unpack into (chromosome, start, end). The value returned
        is whatever `bw.stats` gives for that interval (per the pyBigWig
        documentation, a list holding the mean signal by default).
        """
        chromosome,start,end = peak
        # use the BigWig parser to get the stats of the peak
        return self.bw.stats(chromosome,start,end)
    def __del__(self) :
        # close the BigWig file only if it was actually opened, and only once
        bw = getattr(self,'bw',None)
        if bw is not None :
            bw.close()
            self.bw = None

In [None]:
def average_peak_counts(peaks,condition) :
    """
    Average, over a set of peaks, the signal that a condition has on them.

    Parameters
    ----------
    peaks : array of peaks; `peaks.size` values are averaged.
    condition : object whose `peak_counts(peak)` returns a sequence; its
        first element is taken as the value of the peak.

    Returns
    -------
    The mean of the per-peak values, with NaN values counted as 0.
    """
    per_peak = np.zeros(peaks.size)
    for idx,region in enumerate(peaks) :
        stats = condition.peak_counts(region)
        per_peak[idx] = stats[0]
    # peaks without a value contribute zero signal to the average
    cleaned = np.where(np.isnan(per_peak),0.0,per_peak)
    return cleaned.mean()

## Analysis
Now that I have a good code base ready, I can start the analysis.

In [None]:
# pack each experiment (peak list + BigWig track) into a "Condition" object
high = Condition(name='high',peak_code='all_treated',
                 concentration=0.05,sample_id='gv_107_01_01_chipseq')
medium1 = Condition(name='medium1',peak_code='4HCP',
                    concentration=0.10,sample_id='gv_108_01_01_chipseq')
medium2 = Condition(name='medium2',peak_code='3HCP',
                    concentration=0.50,sample_id='gv_109_01_01_chipseq')
low = Condition(name='low',peak_code='1HCP',
                concentration=10.0,sample_id='gv_111_01_01_chipseq')

In [None]:
# average signal of the "high" track over the peaks of each population,
# in the order H, M1, M2, L
high.avH,high.avM1,high.avM2,high.avL = (
    average_peak_counts(population.peaks,high)
    for population in (high,medium1,medium2,low)
)

In [None]:
# average signal of the "medium1" track over the peaks of each population,
# in the order H, M1, M2, L
medium1.avH,medium1.avM1,medium1.avM2,medium1.avL = (
    average_peak_counts(population.peaks,medium1)
    for population in (high,medium1,medium2,low)
)

In [None]:
# average signal of the "medium2" track over the peaks of each population,
# in the order H, M1, M2, L
medium2.avH,medium2.avM1,medium2.avM2,medium2.avL = (
    average_peak_counts(population.peaks,medium2)
    for population in (high,medium1,medium2,low)
)

In [None]:
# average signal of the "low" track over the peaks of each population,
# in the order H, M1, M2, L
low.avH,low.avM1,low.avM2,low.avL = (
    average_peak_counts(population.peaks,low)
    for population in (high,medium1,medium2,low)
)

In [None]:
# table of the averages: one row per track, one column per peak population.
# The parenthesised single-argument form prints the same text under both
# Python 2 and Python 3.
print("           avH     avM1    avM2     avL")
print("High    :   %.3f    %.3f    %.3f     %.3f"%(high.avH,high.avM1,high.avM2,high.avL))
print("Medium1 :   %.3f    %.3f    %.3f     %.3f"%(medium1.avH,medium1.avM1,medium1.avM2,medium1.avL))
print("Medium2 :   %.3f    %.3f    %.3f     %.3f"%(medium2.avH,medium2.avM1,medium2.avM2,medium2.avL))
print("Low     :   %.3f    %.3f    %.3f     %.3f"%(low.avH,low.avM1,low.avM2,low.avL))

In [None]:
# the four conditions, listed by increasing concentration; this order fixes
# the order of the entries in the ratio arrays and of the points in the plot
conditions = [high,medium1,medium2,low]

In [None]:
# for every condition, the average signal over the H, M1 and M2 peak
# populations divided by the average signal over the L population
nconditions = len(conditions)
H_to_L = np.array([condition.avH/condition.avL for condition in conditions],dtype=float)
M1_to_L = np.array([condition.avM1/condition.avL for condition in conditions],dtype=float)
M2_to_L = np.array([condition.avM2/condition.avL for condition in conditions],dtype=float)

In [None]:
# ratios of the average peak signal as a function of the concentration,
# on log-log axes
concentrations = [condition.concentration for condition in conditions]
plt.loglog(concentrations,H_to_L,label='High')
plt.loglog(concentrations,M1_to_L,label='Medium1')
plt.loglog(concentrations,M2_to_L,label='Medium2')
# label the axes so that the figure can be read on its own
plt.xlabel('concentration')
plt.ylabel('average peak signal relative to the Low population')
plt.legend(loc='upper right')
plt.show()

This analysis shows that for the "medium1" and "medium2" population there is a non-monotonic dependency of the ratio H/M on the concentration. This is rather promising, in the sense that this demonstrates, if correct, that one cannot explain this data on the basis of a simple model in which the transition probabilities are uniform.

The next step is to try to see whether this is not an artefact of having considered the values in the BigWig file. That is, let's see whether there is the same effect simply by looking at the number of counts in a region.

## Coda: recovering the 2HCP

In the data folder that Xavi gave me I don't see the 2HCP data file. However, I see a file that is named something like "union_allpeaks" so I try here to take that file, and do a diff with the union of all the other peaks to recover the 2HCP.

In [None]:
# peaks listed in the 'union_all_samples' table
allpeaks = load_peaks(peaks_id='union_all_samples')

In [None]:
# compare the number of peaks in the union file with the sum over the four
# populations loaded above; the difference would be the missing 2HCP peaks.
# The parenthesised form prints the same under Python 2 and Python 3.
population_total = high.peaks.size + medium1.peaks.size + medium2.peaks.size + low.peaks.size
print(population_total)
print(allpeaks.size)
print(allpeaks.size - population_total)

The numbers don't quite add up, so I should ask Guille, Roser and Xavi where the list of 2HCP peaks is.