In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import pysam
import os
import pr_peaks

# 2018-02-19 Number of reads analysis
I want to perform the same analysis as before, but simply looking at the number of reads corresponding to each of the peaks.

The pieces of code below are just copy/pasted from my previous notebook.

In [None]:
def chipseq_bam_location (sample_id,xavi_datadir='/mnt/xavi/data') :
    """Return the path of a BAM alignment file for a ChIP-seq sample.

    The directory ``<xavi_datadir>/chipseq/samples/<sample_id>/alignments``
    is walked recursively and every file ending in ".bam" is collected.
    The first path that contains "with_control" is preferred; if there is
    none, the last BAM file found is returned.

    Parameters
    ----------
    sample_id : str
        Name of the sample directory.
    xavi_datadir : str
        Root directory of the data tree.

    Returns
    -------
    str or None
        Path of the selected BAM file, or None (after issuing a warning)
        if no BAM file exists for the sample.
    """
    # local import: the notebook's import cell does not provide `warnings`
    import warnings
    # build the directory name where the files are
    d = "%s/chipseq/samples/%s/alignments"%(xavi_datadir,sample_id)
    # collect all the files that end with ".bam" under that directory
    bamfiles = []
    for root,sub,files in os.walk(d) :
        for f in files :
            if f.endswith (".bam") :
                bamfiles.append(os.path.join(root,f))
    if not bamfiles :
        warnings.warn('chipseq_bam_location: Data not found for %s'%sample_id)
        return None
    # prefer the file whose path contains "with_control"
    for bamfile in bamfiles :
        if 'with_control' in bamfile :
            return bamfile
    # otherwise fall back to the last file that was found
    return bamfiles[-1]

In [None]:
class Condition :
    """One experimental condition: its peak set plus an open BAM file.

    Attributes
    ----------
    name, peak_code, concentration, sample_id
        Stored verbatim from the constructor arguments.
    peaks
        Whatever ``pr_peaks.load_hcp_peaks(peak_code)`` returns.
    bam_file
        Path returned by ``chipseq_bam_location(sample_id)``.
    bam
        ``pysam.AlignmentFile`` opened on ``bam_file``.
    """
    def __init__(self,name,peak_code,concentration,sample_id) :
        self.name = name
        self.peak_code = peak_code
        self.concentration = concentration
        self.sample_id = sample_id
        # define the attribute up front, so that __del__ stays safe even if
        # one of the loading steps below raises
        self.bam = None
        # load the peaks
        self.peaks = pr_peaks.load_hcp_peaks(self.peak_code)
        # init the BAM file
        self.bam_file = chipseq_bam_location(sample_id)
        if self.bam_file is None :
            # fail with an explicit message instead of handing None to pysam
            raise IOError('No BAM file found for sample %s'%sample_id)
        # init the pysam parser
        self.bam = pysam.AlignmentFile(self.bam_file)
    def peak_counts(self,peak) :
        """Return ``self.bam.count`` for a (chromosome, start, end) peak."""
        chromosome,start,end = peak
        # the chromosome is passed to the BAM parser as a string
        chromosome = str(chromosome)
        # use the BAM parser to count the reads in the peak interval
        return self.bam.count(chromosome,start,end)
    def __del__(self) :
        # the instance may be only partially initialised: close the BAM
        # file only if it was actually opened
        bam = getattr(self,'bam',None)
        if bam is not None :
            bam.close()
            self.bam = None

In [None]:
def average_peak_counts(peaks,condition) :
    """Return the mean of ``condition.peak_counts(peak)`` over ``peaks``.

    Parameters
    ----------
    peaks : iterable
        Each item is passed unchanged to ``condition.peak_counts``.
    condition : object
        Anything exposing a ``peak_counts(peak)`` method.

    Returns
    -------
    numpy.float64
        Average count (NaN when ``peaks`` is empty).
    """
    # build the array from the iteration itself, so that its length always
    # equals the number of peaks actually visited (``peaks.size`` counts
    # every element of a 2-D array and does not exist for lists)
    pcounts = np.array([condition.peak_counts(peak) for peak in peaks],dtype=float)
    return pcounts.mean()

In [None]:
# one Condition per sample: each bundles the peak set and the BAM file of the sample
high = Condition(name='high',
                 peak_code='all_treated',
                 concentration=0.05,
                 sample_id='gv_107_01_01_chipseq')
medium1 = Condition(name='medium1',
                    peak_code='4HCP',
                    concentration=0.10,
                    sample_id='gv_108_01_01_chipseq')
medium2 = Condition(name='medium2',
                    peak_code='3HCP',
                    concentration=0.50,
                    sample_id='gv_109_01_01_chipseq')
low = Condition(name='low',
                peak_code='1HCP',
                concentration=10.0,
                sample_id='gv_111_01_01_chipseq')

In [None]:
# average read count of the 'high' sample over the peak set of each condition
for attribute,peak_source in (('avH',high),('avM1',medium1),('avM2',medium2),('avL',low)) :
    setattr(high,attribute,average_peak_counts(peak_source.peaks,high))

In [None]:
# average read count of the 'medium1' sample over the peak set of each condition
for attribute,peak_source in (('avH',high),('avM1',medium1),('avM2',medium2),('avL',low)) :
    setattr(medium1,attribute,average_peak_counts(peak_source.peaks,medium1))

In [None]:
# average read count of the 'medium2' sample over the peak set of each condition
for attribute,peak_source in (('avH',high),('avM1',medium1),('avM2',medium2),('avL',low)) :
    setattr(medium2,attribute,average_peak_counts(peak_source.peaks,medium2))

In [None]:
# average read count of the 'low' sample over the peak set of each condition
for attribute,peak_source in (('avH',high),('avM1',medium1),('avM2',medium2),('avL',low)) :
    setattr(low,attribute,average_peak_counts(peak_source.peaks,low))

In [None]:
# summary table: one row per sample, one column per peak set.
# print is called with a single parenthesised argument, which gives the same
# output under Python 2 and also runs under Python 3.
print("           avH     avM1    avM2     avL")
print("High    :   %.3f    %.3f    %.3f     %.3f"%(high.avH,high.avM1,high.avM2,high.avL))
print("Medium1 :   %.3f    %.3f    %.3f     %.3f"%(medium1.avH,medium1.avM1,medium1.avM2,medium1.avL))
print("Medium2 :   %.3f    %.3f    %.3f     %.3f"%(medium2.avH,medium2.avM1,medium2.avM2,medium2.avL))
print("Low     :   %.3f    %.3f    %.3f     %.3f"%(low.avH,low.avM1,low.avM2,low.avL))

In [None]:
# all the samples in one list, in the order used for the ratio arrays and the plot below
conditions = [high,medium1,medium2,low]

In [None]:
# for every sample, ratio of the average count on the H, M1 and M2 peak sets
# to the average count on the L peak set
nconditions = len(conditions)
H_to_L = np.array([condition.avH/condition.avL for condition in conditions],dtype=float)
M1_to_L = np.array([condition.avM1/condition.avL for condition in conditions],dtype=float)
M2_to_L = np.array([condition.avM2/condition.avL for condition in conditions],dtype=float)

In [None]:
# ratios to the L peak set as a function of the concentration of each sample
concentrations = [condition.concentration for condition in conditions]
for ratios,label in ((H_to_L,'High'),(M1_to_L,'Medium1'),(M2_to_L,'Medium2')) :
    plt.loglog(concentrations,ratios,label=label)
# label the axes so that the figure can be read on its own
plt.xlabel('Concentration')
plt.ylabel('Average read count relative to L peaks')
plt.legend(loc='upper right')
plt.show()

The results of this analysis, which uses the ratio between the number of reads instead of the peak quality, show that there is an interesting non-monotonic dependence of the ratio on the concentration.

## A model explanation

Let's think for a moment that this effect is real. I want to look at possible reasons why this can be the case. I'll turn back to the chair model and look at non-uniform transition matrices.

### Array init

In [None]:
# init constants of the system
# N and n are both passed to pr_peaks.init_omega_t below; n is also the length
# of the site arrays and the size of the n x n transition matrices
N = 500                                   # number of equivalent systems
n = 340                                   # number of sites in each system

In [None]:
# M sites sit at indices 0,10,20,... and H sites at indices 5,15,25,...
Msites = np.arange(0,n,10)
Hsites = np.arange(5,n,10)
# tau is 1 everywhere, except on the M sites (5) and on the H sites (20)
site_taus = np.ones(n)
for special_sites,tau in ((Msites,5.0),(Hsites,20.0)) :
    site_taus[special_sites] = tau
# show the tau profile along the sites
fig = plt.figure(figsize=(10,3))
x = np.arange(n)
plt.bar(x,site_taus)
plt.xlabel("Site index")
plt.ylabel(r"$\tau$")
plt.show()

In [None]:
# L sites are all the sites that are neither H nor M. Building the list with a
# set lookup avoids `range(n).remove(...)`, which fails on Python 3 (where
# range is not a list) and rescans the list for every removed site.
special_sites = set(int(site) for site in Hsites) | set(int(site) for site in Msites)
Lsites = [site for site in range(n) if site not in special_sites]

Now I init the transition matrices: I'll init the uniform matrix so I can have a comparison reference result. Before doing that I want to add another piece to my codebase: a class to neatly contain all the tests I'll do with a given transition matrix hypothesis.

In [None]:
class JumpingModel :
    """Container for one transition-matrix hypothesis and its simulation results.

    ``omega_t`` and ``occupancy`` are dictionaries keyed by the ``mu`` value
    given to ``run``.
    """
    def __init__ (self,T,site_taus) :
        self.T,self.site_taus = T,site_taus
        # results of the runs, filled in by `run`
        self.omega_t = dict()
        self.occupancy = dict()
    def run(self,nsteps,mu,sigma,omega_t_initial) :
        """Run one simulation and store its output under the key ``mu``.

        The output of ``pr_peaks.run_chair_simulation`` is stored in
        ``self.omega_t[mu]`` and its sum along axis 0 in
        ``self.occupancy[mu]``. ``sigma`` is accepted but not used here.
        """
        trajectory = pr_peaks.run_chair_simulation(nsteps,
                                                   omega_t_initial,
                                                   self.T,
                                                   self.site_taus)
        self.omega_t[mu] = trajectory
        self.occupancy[mu] = trajectory.sum(axis=0)

### Flat transition matrix

In [None]:
# parameters shared by all the simulations below
nsteps = 1000          # passed as `nsteps` to JumpingModel.run
sigma = None           # passed to pr_peaks.init_omega_t and to JumpingModel.run
mus = [1,2,5,10,20]    # one simulation is run for each of these values of mu

In [None]:
%%time
# reference model: every entry of the transition matrix is equal to 1/n
Tflat = np.ones((n,n))/n
flat = JumpingModel(Tflat,site_taus)
for mu in mus :
    # parenthesised print: same output on Python 2, valid on Python 3
    print("Mu = %d"%(mu))
    omega_t_initial = pr_peaks.init_omega_t(N,n,mu,sigma)
    flat.run(nsteps,mu,sigma,omega_t_initial)

### HL : high-to-low loops
I want to see now what happens if I introduce a link between every H site and an L site (picked randomly).

The idea here is that such a link increases the probability that, once an H site becomes free, a searcher immediately jumps to it. This effect should be more evident when there are more searchers present in the system.

In [None]:
# transition matrix of the HL model: start from uniform weights...
THL = np.ones((n,n))
nH = len(Hsites)
# ...pick one L partner (with replacement) for every H site...
target_L_sites = np.random.choice(Lsites,size=nH)
# ...and add one unit of weight to both directions of each H-L link.
# np.add.at accumulates repeated index pairs, exactly like an explicit loop.
np.add.at(THL,(Hsites,target_L_sites),1)
np.add.at(THL,(target_L_sites,Hsites),1)
THL = mbt.row_normalize_matrix(THL)

In [None]:
%%time
# run the HL model for the same values of mu used for the flat model
HL = JumpingModel(THL,site_taus)
for mu in mus :
    # parenthesised print: same output on Python 2, valid on Python 3
    print("Mu = %d"%(mu))
    omega_t_initial = pr_peaks.init_omega_t(N,n,mu,sigma)
    HL.run(nsteps,mu,sigma,omega_t_initial)

In [None]:
# one row per value of mu; left column: flat model, right column: HL model
nrows = len(mus)
fig,axarr = plt.subplots(nrows,2,figsize=(15,5))
x = np.arange(n)
show_xaxis = False
panels = ((0,flat,'b'),(1,HL,'r'))
for i,mu in enumerate(mus) :
    # the x axis is drawn only on the bottom row
    show_xaxis = (i == nrows-1)
    for column,model,color in panels :
        ax = axarr[i,column]
        mbt.line_plot(ax,x,model.occupancy[mu],show_xaxis=show_xaxis,color=color)
        ax.text(0.8,0.8,r'$\mu = %.1f$'%(mu),transform=ax.transAxes,fontsize=18)

plt.show()

Now we can compute the ratio of the H to L peaks in the two cases.

In [None]:
# ratio between the mean occupancy of the H sites and that of the L sites,
# for every value of mu and for both models
for model in (flat,HL) :
    model.H_to_L = np.zeros(len(mus))
for i,mu in enumerate(mus) :
    for model in (flat,HL) :
        occupancy = model.occupancy[mu]
        model.avH = occupancy[Hsites].mean()
        model.avL = occupancy[Lsites].mean()
        model.H_to_L[i] = model.avH/model.avL

In [None]:
# H to L ratio as a function of mu for the two models. The curves are labelled
# because the colours here are not the same as in the occupancy figure.
plt.plot(mus,flat.H_to_L,'r',label='flat')
plt.plot(mus,HL.H_to_L,'b',label='HL')
plt.xlabel(r'$\mu$')
plt.ylabel('H to L ratio')
plt.legend()
plt.show()

### HH model
Now let's examine another model: where all the H sites are avid and share contacts between themselves, and leave everyone else thirsty.

In [None]:
# prepare the transition matrix: uniform weights, plus extra weight on the
# links between H sites
THH = np.ones((n,n))
nH = len(Hsites)
for i in range(nH) :
    for j in range(i,nH) :
        # i and j are positions in Hsites: translate them to the site indices,
        # otherwise the extra weight lands on sites 0..nH-1 instead of the H sites
        i1 = Hsites[i]
        i2 = Hsites[j]
        THH[i1,i2] += 1
        THH[i2,i1] += 1
# NOTE(review): j starts from i, so each H site also gets +2 on its own
# diagonal entry; kept as in the original loop - confirm that it is intended
THH = mbt.row_normalize_matrix(THH)

In [None]:
%%time
# run the HH model for the same values of mu used for the flat model
HH = JumpingModel(THH,site_taus)
for mu in mus :
    # parenthesised print: same output on Python 2, valid on Python 3
    print("Mu = %d"%(mu))
    omega_t_initial = pr_peaks.init_omega_t(N,n,mu,sigma)
    HH.run(nsteps,mu,sigma,omega_t_initial)

In [None]:
# one row per value of mu; left column: flat model, right column: HH model
fig,axarr = plt.subplots(len(mus),2,figsize=(15,5))
x = np.arange(n)
show_xaxis=False
for i,mu in enumerate(mus) :
    # draw the x axis on the bottom row only; the index of the last row is
    # len(mus)-1 (comparing with len(mus) would never be true)
    if i==len(mus)-1 :
        show_xaxis=True
    # with flat transition probability
    ax = axarr[i,0]
    mbt.line_plot(ax,x,flat.occupancy[mu],show_xaxis=show_xaxis,color='b')
    ax.text(0.8,0.8,r'$\mu = %.1f$'%(mu),transform=ax.transAxes,fontsize=18)
    # with HH contacts
    ax = axarr[i,1]
    mbt.line_plot(ax,x,HH.occupancy[mu],show_xaxis=show_xaxis,color='r')
    ax.text(0.8,0.8,r'$\mu = %.1f$'%(mu),transform=ax.transAxes,fontsize=18)

plt.show()

In [None]:
# ratio between the mean occupancy of the H sites and that of the L sites
# in the HH model, for every value of mu
nmus = len(mus)
HH.H_to_L = np.zeros(nmus)
for i,mu in enumerate(mus) :
    occupancy = HH.occupancy[mu]
    HH.avH,HH.avL = occupancy[Hsites].mean(),occupancy[Lsites].mean()
    HH.H_to_L[i] = HH.avH/HH.avL

In [None]:
# H to L ratio as a function of mu for the two models. The curves are labelled
# because the colours here are not the same as in the occupancy figure.
plt.plot(mus,flat.H_to_L,'r',label='flat')
plt.plot(mus,HH.H_to_L,'b',label='HH')
plt.xlabel(r'$\mu$')
plt.ylabel('H to L ratio')
plt.legend()
plt.show()

The conclusion of this part is that the various models have features that depend on the choice of the parameters. In this case one can only speculate that there is a certain combination of parameters that leads to the non-monotonic behaviour that I observed in the ChIP-seq data.

The next, important step is to look at a simpler system that can be analyzed in terms of fewer parameters that can then be easily related to macroscopic, measurable features in the experimental system.