In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import os
import pr_peaks
import pysam

In [None]:
# ChIP-seq conditions used throughout the notebook. Each row below is passed
# positionally, unchanged, to pr_peaks.Condition.
# NOTE(review): the first field is presumably the condition name and the last
# one the sample ID of the experiment; the meaning of the two middle fields is
# not visible from this notebook -- confirm against pr_peaks.Condition.
conditions = list(
    pr_peaks.Condition(*fields) for fields in (
        ('high'   ,'all_treated',0.05,'gv_107_01_01_chipseq'),
        ('medium1','4HCP'       ,0.10,'gv_108_01_01_chipseq'),
        ('medium2','3HCP'       ,0.50,'gv_109_01_01_chipseq'),
        ('low'    ,'1HCP'       ,10.0,'gv_111_01_01_chipseq'),
    )
)

# 2018-04-06 Histone modification patterns

The non-linear response of the system as a function of the concentration can be explained by a positive feedback loop mediated by histone modifications. The idea is illustrated in the paper by Sneppen in Molecular Systems Biology, 2008. The prediction of this model is quite clear: for the H sites, the pattern of histone modifications in the vicinity of the peaks will be drastically different before and after hormone stimulation. For the L sites, one would not expect much of a difference. Let's try to take the data from the ChIP-seq experiments performed by Guille, and let's look at the various histone modification patterns.

## Loading the data
First, let's look at how to load the data. There are many experiments with many different codes, so I need a convenient data structure to hold them.

In [None]:
class ChIPseq :
    """Handle on the aligned reads (BAM file) of one ChIP-seq sample.

    The BAM file is located with `mbt.chipseq_bam_location(sample_id)` and
    opened with `pysam.AlignmentFile`. The file stays open until `close()` is
    called or the object is garbage collected.

    Attributes:
        sample_id: the sample identifier passed to the constructor.
        bam_file: location of the BAM file, as returned by
            `mbt.chipseq_bam_location`.
        bam: the open `pysam.AlignmentFile`, or None if the file could not be
            opened or has been closed.
    """
    def __init__(self,sample_id) :
        self.sample_id = sample_id
        # define the attribute before anything can fail, so that close() and
        # __del__ are safe on a partially constructed instance
        self.bam = None
        self.bam_file = mbt.chipseq_bam_location(sample_id)
        # init the pysam parser
        self.bam = pysam.AlignmentFile(self.bam_file)
    def peak_counts(self,peak,extend=None) :
        """Return the number of reads counted by the BAM reader in a region.

        Args:
            peak: a `(chromosome, start, end)` triple. The chromosome is
                converted to `str` before querying.
            extend: optional number of bases by which the region is widened on
                both sides. The extended start is clamped at 0, because a
                negative start is not a valid coordinate. The end is not
                clamped.
                NOTE(review): assumes the BAM reader tolerates an end beyond
                the chromosome length -- confirm.

        Returns:
            Whatever `self.bam.count(chromosome, start, end)` returns.
        """
        chromosome,start,end = peak
        # plain Python ints: avoids wrap-around if the peak table stores
        # unsigned integers and the window is extended past position 0
        start,end = int(start),int(end)
        if extend is not None :
            start = max(0,start-extend)
            end += extend
        chromosome = str(chromosome)
        # use the pysam parser to count the reads in the region
        return self.bam.count(chromosome,start,end)
    def close(self) :
        """Close the BAM file if it is open. Safe to call more than once."""
        bam = getattr(self,'bam',None)
        if bam is not None :
            bam.close()
            self.bam = None
    def __del__(self) :
        # __del__ also runs when __init__ raised half-way through, in which
        # case there may be no open file to close
        self.close()

In [None]:
# One ChIPseq handle per histone-mark experiment, keyed by '<mark>-<time>'.
# The handles are created in the order listed below.
experiments = dict(
    (label,ChIPseq(sample_id)) for label,sample_id in (
        ('H3K14ac-T0'  ,'gv_037_01_01_chipseq'),
        ('H3K14ac-T30' ,'gv_038_01_01_chipseq'),
        ('H3K27ac-T0'  ,'gv_039_01_01_chipseq'),
        ('H3K27ac-T30' ,'gv_040_01_01_chipseq'),
        ('H3K27me3-T0' ,'gv_041_01_01_chipseq'),
        ('H3K27me3-T30','gv_042_01_01_chipseq'),
        ('H3K36me2-T0' ,'gv_043_01_01_chipseq'),
        ('H3K36me2-T30','gv_044_01_01_chipseq'),
        ('H3K4me1-T0'  ,'gv_045_01_01_chipseq'),
        ('H3K4me1-T30' ,'gv_046_01_01_chipseq'),
        ('H3K4me3-T0'  ,'gv_047_01_01_chipseq'),
        ('H3K4me3-T30' ,'gv_048_01_01_chipseq'),
        ('H3K9me3-T0'  ,'gv_049_01_01_chipseq'),
        ('H3K9me3-T30' ,'gv_050_01_01_chipseq'),
    )
)

The data is loaded, so now I need a convenient way of accessing it. A good way is putting the read counts associated with the regions of interest into a table.

In [None]:
# Layout of the count tables built below: one row per peak, one column per
# (mark, time) experiment, with the time points of a mark next to each other.
# For the H peaks, for example:
#
#            H3K27ac               H3K27me3
#          ____________         _____________
#          T0       T30         T0        T30
#   p1
#   p2
#   .
#   .
#   pN
#
# `exp_table` maps the experiment name '<mark>-<time>' to its column number.

times = ['T0','T30']
marks = ['H3K14ac','H3K27ac','H3K27me3','H3K36me2','H3K4me1','H3K4me3','H3K9me3']
ncols = len(times)*len(marks)

# columns are numbered mark by mark, and within each mark in the order of `times`
exp_table = dict(
    (name,column) for column,name in enumerate(
        '%s-%s'%(mark,time) for mark in marks for time in times
    )
)

In [None]:
# For every condition, build a (number of peaks) x (number of experiments)
# table of read counts, and store it under the name of the condition.
peak_counts = {}
for condition in conditions :
    peaks = condition.peaks
    counts_table = np.zeros((peaks.size,len(times)*len(marks)),dtype=np.int32)
    for mark in marks :
        for time in times :
            name = '%s-%s'%(mark,time)
            mbt.log_message('fill peaks','%s/%s/%s'%(condition.name,mark,time))
            # the ChIP-seq handle of this experiment, and its column in the table
            chip,column = experiments[name],exp_table[name]
            for row,peak in enumerate(peaks) :
                # reads in the peak region widened by 10000 on both sides
                counts_table[row,column] = chip.peak_counts(peak,extend=10000)
    peak_counts[condition.name] = counts_table

Now that the data is loaded into a nice data structure, we can look at how the enrichment around the different regions of the genome changes before and after hormone stimulation. A model in which histone-modification-mediated positive feedback loops are responsible for the change in the stability of binding of TFs to regions of DNA predicts that around the H peaks there will be a massive increase of acetylation and a depletion of methylation. Around the L peaks nothing significant is expected to happen.

In [None]:
# One bar plot per condition: for each mark, the read count averaged over all
# the peaks of the condition, at the two time points side by side.
for condition in conditions :
    # mean over the peaks (rows) of each (mark, time) column, computed once
    mean_counts = peak_counts[condition.name].mean(axis=0)
    fig,ax = plt.subplots(1,1,figsize=(len(marks),4))
    mbt.ax_only_y(ax,show_xaxis=True)
    width = 0.25
    # one group of bars per mark
    indices = np.arange(len(marks))
    # the columns alternate between the two time points (see `exp_table`), so
    # the even columns hold times[0] and the odd columns hold times[1]. The bars
    # are drawn once: there is no need to loop over the marks.
    ax.bar(indices,mean_counts[0::2],width,color='k',label=times[0])
    ax.bar(indices+width,mean_counts[1::2],width,color='lightgray',label=times[1])
    ax.set_xticks(indices+width/2)
    ax.set_xticklabels(marks,rotation=90)
    ax.set_xlabel('Mark',fontsize=24)
    ax.set_ylabel('Read counts',fontsize=24)
    ax.set_title(condition.name,fontsize=32)
    # tell the reader which color is which time point
    ax.legend()
    plt.show()

So we have the answer: histone-modification-mediated positive feedback loops cannot be held responsible for the change in stability of the transcription factors bound to the sites. If that were the case, the number of reads associated with a particular histone mark would be observed to be significantly different before and after hormone stimulation, whereas these plots show that it is not. Therefore, we can conclude that this is not the mechanism that is responsible for the observed effect. One caveat: the plots compare mean raw read counts in the regions around the peaks, which are not normalized for the sequencing depth of each experiment, and no statistical test has been applied.