In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import pr_peaks
from scipy.stats import gaussian_kde

# 2018-02-23 Are you sure?
I need to be absolutely sure that the non-monotonic behaviour of the read-count ratios as a function of concentration, which is driving me nuts, is really there.

In [None]:
# One row per experimental condition. The columns are passed positionally to
# pr_peaks.Condition; they look like (name, peak-set label, concentration,
# ChIP-seq sample id) -- NOTE(review): confirm against pr_peaks.Condition.
condition_specs = (
    ('high',    'all_treated', 0.05, 'gv_107_01_01_chipseq'),
    ('medium1', '4HCP',        0.10, 'gv_108_01_01_chipseq'),
    ('medium2', '3HCP',        0.50, 'gv_109_01_01_chipseq'),
    ('medium3', '3HCP',        1.00, 'gv_110_01_01_chipseq'),
    ('low',     '1HCP',        10.0, 'gv_111_01_01_chipseq'),
)
high, medium1, medium2, medium3, low = (
    pr_peaks.Condition(*spec) for spec in condition_specs
)

**NOTE**: I don't have the list of the 2HCP peaks. For the moment this doesn't matter, because I don't need to compute the averages over that population. If needed, I'll define the ratios for the M peaks using the peaks corresponding to the 3HCP case.

Now I define the peaks, so that I can perform the averages easily.

In [None]:
# Reference peak sets used by the averages below: the peaks of the 'high' and
# 'low' conditions, and those of 'medium2' for the intermediate (M) set.
Hpeaks, Mpeaks, Lpeaks = high.peaks, medium2.peaks, low.peaks

In [None]:
# For every condition, average the peak counts over each reference peak set,
# cache the result on the condition object, and store the H/L and M/L ratios
# of those averages.
conditions = [high, medium1, medium2, medium3, low]
nconditions = len(conditions)
concentrations = [condition.concentration for condition in conditions]
H_to_L, M_to_L = np.zeros(nconditions), np.zeros(nconditions)
for i, condition in enumerate(conditions):
    for attr, peak_set in (('avH', Hpeaks), ('avM', Mpeaks), ('avL', Lpeaks)):
        setattr(condition, attr, pr_peaks.average_peak_counts(peak_set, condition))
    H_to_L[i] = condition.avH / condition.avL
    M_to_L[i] = condition.avM / condition.avL

In [None]:
# Ratio of the H-peak average to the M-peak average, one entry per condition.
# dtype=float keeps the result a float64 array, as np.zeros() would give.
H_to_M = np.array(
    [condition.avH / condition.avM for condition in conditions],
    dtype=float,
)

In [None]:
# Ratios between the peak-set averages versus concentration (log x-axis).
# The thin dashed horizontal line marks a ratio of 1.
ratio_series = (
    (H_to_L, 'o--', 'H to L'),
    (H_to_M, 'v--', 'H to M'),
    (M_to_L, '^--', 'M to L'),
)
for ratios, fmt, series_label in ratio_series:
    plt.semilogx(concentrations, ratios, fmt, label=series_label)
plt.axhline(y=1, linestyle='--', linewidth=0.75, color='k')
plt.xlabel('Concentration [nM]')
plt.ylabel('Ratio')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Collect the cached per-condition averages into plain lists, in the same
# order as `conditions`, so they can be plotted against `concentrations`.
avH, avM, avL = [], [], []
for condition in conditions:
    avH.append(condition.avH)
    avM.append(condition.avM)
    avL.append(condition.avL)

In [None]:
# Average counts over each peak set versus concentration, on log-log axes.
average_series = (
    (avH, 'o--', 'High'),
    (avM, '^--', 'Medium'),
    (avL, 'x--', 'Low'),
)
for averages, fmt, series_label in average_series:
    plt.loglog(concentrations, averages, fmt, linewidth=1, label=series_label)
plt.xlabel('Concentration [nM]')
plt.ylabel('Number of reads')
plt.legend(loc='lower right')
plt.show()

Let's make sure that there is no important effect due to the peak size.

In [None]:
def _peak_sizes(peaks):
    """Return the size (end - start) of every peak as an int32 array."""
    return np.array([peak['end'] - peak['start'] for peak in peaks],
                    dtype=np.int32)

# Sizes of the peaks in each of the three reference peak sets.
Lpeaksize = _peak_sizes(Lpeaks)
Mpeaksize = _peak_sizes(Mpeaks)
Hpeaksize = _peak_sizes(Hpeaks)

In [None]:
# Gaussian kernel density estimate of each peak-size sample.
Lsize_k, Msize_k, Hsize_k = map(gaussian_kde, (Lpeaksize, Mpeaksize, Hpeaksize))

In [None]:
# Evaluate the three size densities on the integer grid 150..1499 and overlay
# them on a single plot.
x = np.arange(150, 1500)
size_densities = (
    (Hsize_k, 'High'),
    (Msize_k, 'Medium'),
    (Lsize_k, 'Low'),
)
for size_kde, series_label in size_densities:
    plt.plot(x, size_kde(x), label=series_label)
plt.legend(loc='upper right')
plt.xlabel('Peak size')
plt.ylabel('Distribution')
plt.show()

So there is indeed a significant difference in the size of the peaks. The most basic correction is to normalize the read count of each peak by its length in base pairs.

In [None]:
def average_peak_counts_normalized(peaks,condition) :
    """Average read count per base pair over a set of peaks.

    For every peak, the value returned by ``condition.peak_counts(peak)`` is
    divided by the peak length, ``peak['end'] - peak['start']``, and the mean
    of these length-normalized values is returned.

    Parameters
    ----------
    peaks : sized iterable of peak records
        Each record must support ``peak['start']`` and ``peak['end']``
        (for instance a 1-D structured array or a list of dicts).
    condition : object
        Must provide a ``peak_counts(peak)`` method.

    Returns
    -------
    float
        Mean of the per-base-pair counts. For an empty `peaks` this is NaN
        (NumPy emits a warning). Zero-length peaks are not handled specially.
    """
    # len() is the number of items the loop below actually visits. It also
    # works for plain sequences, unlike the ndarray-only `.size`, which counts
    # every element and is therefore wrong for anything but a 1-D array.
    npeaks = len(peaks)
    pcounts = np.zeros(npeaks)
    for i,peak in enumerate(peaks) :
        # float() forces a true division even when both operands are integers.
        pcounts[i] = condition.peak_counts(peak)/float(peak['end']-peak['start'])
    return pcounts.mean()

In [None]:
# Same quantities as for the raw counts, but computed from the per-base-pair
# normalized averages: cache them on each condition and store the H/L and M/L
# ratios.
conditions = [high, medium1, medium2, medium3, low]
nconditions = len(conditions)
concentrations = [condition.concentration for condition in conditions]
H_to_L_norm, M_to_L_norm = np.zeros(nconditions), np.zeros(nconditions)
for i, condition in enumerate(conditions):
    for attr, peak_set in (('avH_norm', Hpeaks),
                           ('avM_norm', Mpeaks),
                           ('avL_norm', Lpeaks)):
        setattr(condition, attr,
                average_peak_counts_normalized(peak_set, condition))
    H_to_L_norm[i] = condition.avH_norm / condition.avL_norm
    M_to_L_norm[i] = condition.avM_norm / condition.avL_norm

In [None]:
# Gather the cached normalized averages into one list per peak set, ordered
# like `conditions`.
avH_norm, avM_norm, avL_norm = (
    [getattr(condition, attr) for condition in conditions]
    for attr in ('avH_norm', 'avM_norm', 'avL_norm')
)

In [None]:
# Length-normalized averages over each peak set versus concentration
# (log x-axis, linear y-axis).
normalized_series = (
    (avH_norm, 'o--', 'High'),
    (avM_norm, '^--', 'Medium'),
    (avL_norm, 'x--', 'Low'),
)
for averages, fmt, series_label in normalized_series:
    plt.semilogx(concentrations, averages, fmt, linewidth=1, label=series_label)
plt.xlabel('Concentration [nM]')
plt.ylabel('Number of reads (normalized)')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Ratios of the length-normalized averages versus concentration (log x-axis).
# The thin dashed horizontal line marks a ratio of 1.
normalized_ratio_series = (
    (H_to_L_norm, 'o--', 'H to L'),
    (M_to_L_norm, '^--', 'M to L'),
)
for ratios, fmt, series_label in normalized_ratio_series:
    plt.semilogx(concentrations, ratios, fmt, label=series_label)
plt.axhline(y=1, linestyle='--', linewidth=0.75, color='k')
plt.xlabel('Concentration [nM]')
plt.ylabel('Ratio (normalized)')
plt.legend(loc='upper right')
plt.show()

So even after normalizing the number of reads per base pair, I still obtain this lovely maximum of the h-ratio enhancement. What about the GC content of the sequences? Could the maximum reflect PCR biases?