In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import pr_peaks
from scipy.stats import gaussian_kde

# 2018-02-28 Hints of good direction

I found that the tetramerization hypothesis might explain the h-enhancement effect. I looked at whether there is any evidence for the effect in the Hi-C maps, but this has proven difficult. Now I want to see whether there is any evidence for it in the populations of the individual peaks.

First, I'll look at what happens in the theory, when there is contact between two sites and there is a boost in the stability of the complex. What does the population of the sites in contact look like, as a function of the number of searchers?

In [None]:
# Fix the seed so that the stochastic runs below are reproducible.
np.random.seed(85498)

# --- general simulation parameters ---
nsteps = 100000            # handed to JumpingModel.run for every value of mu
n = 100                    # total number of sites
boost = 4.0                # handed to JumpingModel; presumably the stability
                           # boost of sites in contact -- TODO confirm in pr_peaks
mus = np.arange(1, 20, 2)  # values of mu to scan: 1, 3, ..., 19

# --- per-site taus: 20.0 on the H sites, 2.0 on every other (L) site ---
Hsites = [2, 6, 10, 40, 50, 60, 70, 80]
Lsites = [site for site in range(n) if site not in Hsites]
site_taus = np.full(n, 2.0)
site_taus[Hsites] = 20.0

# --- contact lists: one list of partner sites per site ---
# `nocontacts` leaves every list empty; `HHcontact` links only the first two
# H sites to each other.
nocontacts = [[] for _ in range(n)]
HHcontact = [[] for _ in range(n)]
first_H, second_H = Hsites[0], Hsites[1]
HHcontact[first_H] = [second_H]
HHcontact[second_H] = [first_H]

# --- the two Jumping Models differ only in their contact lists ---
uniform = pr_peaks.JumpingModel(nocontacts, site_taus, boost)
HH = pr_peaks.JumpingModel(HHcontact, site_taus, boost)

# --- run both models for every mu, starting from the same initial omega_t ---
# NOTE(review): both runs receive the same `omega_t_initial` object; this
# assumes JumpingModel.run does not modify it in place -- confirm.
for mu in mus:
    omega_t_initial = pr_peaks.init_omega_t(n, mu)
    for tag, model in (('Uniform', uniform), ('HH     ', HH)):
        mbt.log_message(tag, 'mu = %d' % (mu))
        model.run(nsteps, omega_t_initial)

# --- aftermath: post-process each model with the H / L site partition ---
for model in (uniform, HH):
    pr_peaks.H_to_L(model, Hsites, Lsites)

Now that we have the data, let's look at the following thing for each of the sites: the average count as a function of the number of searchers.

In [None]:
nmus = len(mus)
nH   = len(Hsites)
nL   = len(Lsites)

# For each model build two (nmus, nH) tables, one row per value of mu:
#   Hsites_theta      -- theta of each H site
#   H_to_L_individual -- the same values divided by the mean theta of the L sites
for model in (uniform, HH):
    theta_H = np.zeros((nmus, nH))
    ratio_H = np.zeros((nmus, nH))
    for row, mu in enumerate(mus):
        theta_mu = model.theta[mu]
        theta_H[row] = theta_mu[Hsites]
        mean_L = theta_mu[Lsites].mean()
        # divide the stored float row, exactly as the table holds it
        ratio_H[row] = theta_H[row] / mean_L
    model.Hsites_theta = theta_H
    model.H_to_L_individual = ratio_H

In [None]:
# 2x2 figure: columns = model (uniform | HH), rows = quantity (theta | H to L).
fig, axarr = plt.subplots(2, 2, figsize=(10, 10))
panels = [
    (axarr[0, 0], uniform.Hsites_theta),
    (axarr[0, 1], HH.Hsites_theta),
    (axarr[1, 0], uniform.H_to_L_individual),
    (axarr[1, 1], HH.H_to_L_individual),
]

# One curve per H site; the first two H sites are drawn in red, the rest in black.
for j in range(nH):
    color = 'r' if j in (0, 1) else 'k'
    for ax, curves in panels:
        ax.plot(mus, curves[:, j], color=color)

axarr[0, 0].set_title('No contacts', fontsize=18)
axarr[0, 1].set_title('With contacts', fontsize=18)
for top, bottom in zip(axarr[0, :], axarr[1, :]):
    top.set_ylabel(r'$\theta$')
    bottom.set_ylabel(r'H to L ratio')
    bottom.set_xlabel(r'$\mu$')
plt.show()

So the difference here is quite spectacular. The population of the sites that have contacts is growing with the number of searchers, but all the other sites have a uniform decrease. Does this happen also for our beloved data?

In [None]:
# Load the data: one pr_peaks.Condition per experimental condition.
# Each row is the positional argument list of Condition; the third entry is
# presumably the concentration read back later as `.concentration` -- TODO confirm.
# NOTE(review): 'medium2' and 'medium3' both carry the label '3HCP', and
# 'high' has the smallest numeric value (0.05) while 'low' has the largest
# (10.0) -- confirm that this is intended.
condition_specs = [
    ('high',    'all_treated', 0.05, 'gv_107_01_01_chipseq'),
    ('medium1', '4HCP',        0.10, 'gv_108_01_01_chipseq'),
    ('medium2', '3HCP',        0.50, 'gv_109_01_01_chipseq'),
    ('medium3', '3HCP',        1.00, 'gv_110_01_01_chipseq'),
    ('low',     '1HCP',        10.0, 'gv_111_01_01_chipseq'),
]
high, medium1, medium2, medium3, low = [
    pr_peaks.Condition(*spec) for spec in condition_specs
]

In [None]:
# Now calculate the number of counts corresponding to the High peaks,
# as a function of the concentration of hormone.
#
# Produces three arrays (rows = H peaks, columns = conditions):
#   Hpeaks_count[i, j]      -- condition.peak_counts(peak) for H peak i in condition j
#   averageL[j]             -- pr_peaks.average_peak_counts over the L peaks in condition j
#   H_to_L_individual[i, j] -- Hpeaks_count[i, j] / averageL[j]
Hpeaks = high.peaks
Lpeaks = low.peaks
conditions = [high,medium1,medium2,medium3,low]
nconditions = len(conditions)
nHpeaks = Hpeaks.size

# init the arrays
Hpeaks_count = np.zeros((nHpeaks,nconditions))
averageL = np.zeros(nconditions)
H_to_L_individual = np.zeros((nHpeaks,nconditions))

# fill the arrays, one condition (column) at a time
for j,condition in enumerate(conditions) :
    averageL[j] = pr_peaks.average_peak_counts(Lpeaks,condition)
    for i,peak in enumerate(Hpeaks) :
        Hpeaks_count[i,j] = condition.peak_counts(peak)
    # Normalize the whole column at once. (The original line ended with a
    # stray `kj` token, which made this cell a SyntaxError.)
    H_to_L_individual[:,j] = Hpeaks_count[:,j]/averageL[j]

In [None]:
# Concentration of each condition, in the same order as the columns of
# H_to_L_individual.
concentrations = np.array([c.concentration for c in conditions])

# Plot the profiles of (at most) the first 100 H peaks. The bound is clamped
# to the number of rows so that a data set with fewer peaks does not raise
# an IndexError.
nshown = min(100, H_to_L_individual.shape[0])
for j in range(nshown) :
    plt.loglog(concentrations,H_to_L_individual[j,:])
plt.xlabel('Concentration [nM]')
# NOTE(review): the curves are the raw individual H to L ratios; dividing each
# profile by its first value (H_to_L_individual[j,0]) was disabled at some
# point, so the word "Normalized" in the label may be stale -- confirm.
plt.ylabel('Normalized H to L ratio')
plt.show()

From this data it is clear that there are many (if not all) peaks that increase their individual H to L ratio. It is not clear whether there are any of these peaks that do not. I'll try to look at the correlations between the individual peaks.

In [None]:
# Pearson correlation between the concentration profiles of every pair of
# H peaks: corrmat[i, j] correlates row i with row j of H_to_L_individual.
#
# np.corrcoef treats each row as one variable, so a single call yields the
# full symmetric matrix; this replaces a Python double loop that called
# np.corrcoef once per pair. np.atleast_2d keeps the result a matrix even
# when there is only one peak (np.corrcoef then returns a scalar).
# Rows with zero variance give NaN entries, as they did in the pairwise version.
corrmat = np.atleast_2d(np.corrcoef(H_to_L_individual))

In [None]:
# Show the block of the correlation matrix for peaks start..end-1 and label
# the rows with the corresponding entries of Hpeaks.
start, end = 50, 60
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
window = slice(start, end)
cb = ax.matshow(corrmat[window, window])
cax = plt.colorbar(cb)
yticks = range(start, end)
row_labels = [str(Hpeaks[peak_index]) for peak_index in yticks]
ax.set_yticks(range(len(yticks)))
ax.set_yticklabels(row_labels)
plt.show()

This pattern is not easily understandable. Maybe it has something to do with TADs? What about inter-chromosomal correlations?

I'll try to look at the various values of the concentrations and see the distributions of the H to L ratios.

In [None]:
# One Gaussian kernel density estimate per condition, built from the
# individual H to L ratios of all H peaks (one column of H_to_L_individual).
kH_to_L = [
    gaussian_kde(H_to_L_individual[:, column])
    for column in range(concentrations.size)
]

In [None]:
# Evaluate every per-condition KDE on a common grid and overlay the curves.
x = np.arange(1,50,1)
colors = ['b','r','g','k','xkcd:light blue']
for i,kde in enumerate(kH_to_L) :
    plt.plot(x,kde(x),label='c = %.2f'%(concentrations[i]),linewidth=3,
            # cycle through the palette so that more conditions than colors
            # does not raise an IndexError
            color=colors[i % len(colors)])
# The KDEs were built from H_to_L_individual, so the x axis is the H to L
# ratio, not a read count (the previous label said "Number of reads in H peaks").
plt.xlabel('H to L ratio')
plt.ylabel('Distribution')
plt.legend(loc='upper right')
plt.show()

I'll look at the most anticorrelated pair of data points.

In [None]:
# Locate the pair of peaks with the most negative correlation and plot both
# profiles. np.nanargmin is used instead of argmin because corrmat holds NaN
# for any zero-variance profile, and argmin would then return the position of
# the first NaN rather than the true minimum. (np.nanargmin raises ValueError
# if corrmat is entirely NaN.)
i,j = np.unravel_index(np.nanargmin(corrmat),corrmat.shape)
plt.semilogx(concentrations,H_to_L_individual[i,:],'o--')
plt.semilogx(concentrations,H_to_L_individual[j,:],'o--')
plt.xlabel('Concentration [nM]')
plt.ylabel('H to L ratio')
plt.show()

Now I plot the profile that attains the highest value of the H to L ratio.

In [None]:
# Find the (peak, condition) entry holding the largest H to L ratio and plot
# the full concentration profile of that peak.
flat_position = H_to_L_individual.argmax()
i, j = np.unravel_index(flat_position, H_to_L_individual.shape)
best_profile = H_to_L_individual[i, :]
plt.semilogx(concentrations, best_profile, 'o--')
plt.xlabel('Concentration [nM]')
plt.ylabel('H to L ratio')
plt.show()