In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import os
import pr_peaks

In [None]:
# Set up the ChIP-seq conditions used throughout the notebook.
# Each row is passed positionally to pr_peaks.Condition. The fields are presumably
# (condition name, peak-set label, hormone concentration, sample ID) --
# TODO confirm against the pr_peaks.Condition signature.
condition_specs = (
    ('high'   ,'all_treated',0.05,'gv_107_01_01_chipseq'),
    ('medium1','4HCP'       ,0.10,'gv_108_01_01_chipseq'),
    ('medium2','3HCP'       ,0.50,'gv_109_01_01_chipseq'),
    ('medium3','3HCP'       ,1.00,'gv_110_01_01_chipseq'),
    ('low'    ,'1HCP'       ,10.0,'gv_111_01_01_chipseq'),
)
# one Condition object per row, in the same order as the table above
conditions = [pr_peaks.Condition(*spec) for spec in condition_specs]

In [None]:
# prepare the list of chromosomes: chr1 ... chr22 followed by chrX
# (no other chromosome names are added here).
# `range` is used instead of `xrange` so the cell runs unchanged on both
# Python 2 and Python 3; the resulting tuple is identical.
chromosomes = tuple(['chr%d'%i for i in range(1,23)] + ['chrX'])

# 2018-04-05 Assessing background

The objective of this study is to assess whether there are significant differences between the background signal in the H peaks versus the L peaks. If there are, then the analysis of the differences between them becomes complicated. If everything is distributed in a nice way, then I won't worry too much about it, and consider that the h-enhancement effect is robust against removing the background.

After discussing with Guillaume, we thought that the most sensible way of doing this analysis, at least as a first step, is to look at how the values of the ChIP reads versus the input reads are distributed on a 2D plot. If everything aligns nicely on a straight line, then there is no need to do complicated analyses, and we are good to go.

I'll use the Zerone output to do the analysis, so that I won't have to worry again about extracting the number of reads corresponding to a given peak from a given input data file.

First thing: parse the Zerone output.

In [None]:
# general variables: project root (under the user's HOME) and its data folder
pr_peaks_root_dir = '%s/work/CRG/projects/pr_peaks'%os.getenv('HOME')
data_dir = '%s/data'%pr_peaks_root_dir
# A previous analysis showed no significant differences between the results
# obtained with the various input data, so only a single input is used here.
in_bam = 'T0_total_input'
# parse the Zerone output file of every condition for the selected input
for condition in conditions :
    zerone_out = '%s/%s/%s-zerone.out'%(data_dir,in_bam,condition.name)
    log_text = 'Parsing %s/%s'%(condition.name,in_bam)
    mbt.log_message('parse_zerone_output',log_text)
    # the parsed output is attached to the condition object for later cells
    condition.zerone = mbt.parse_zerone_output(zerone_out,chromosome_list=chromosomes)

Next, I'll prepare a table that will contain the data that I want to analyse. There will be as many tables as the number of conditions that I have. Each table will contain a number of rows corresponding to the number of peaks associated with that particular condition (example: the 'high' condition will contain 788 rows). The first column of the table will contain the number of reads corresponding to the *input* DNA in the region corresponding to the peak. The rest of the columns will contain the number of reads in the *ChIP* experiments.

In [None]:
# peaktable maps a condition name to a 2D array with one row per peak of that
# condition: column 0 holds the control reads per bp, and column j+1 holds the
# ChIP reads per bp measured in the j-th entry of `conditions`.
peaktable = {}
n_columns = 1+len(conditions)
for condition in conditions :
    # start from an all-zero table for this condition
    table = np.zeros((len(condition.peaks),n_columns))
    for i,peak in enumerate(condition.peaks) :
        for j,chip in enumerate(conditions) :
            # Zerone records matching this peak in the j-th experiment
            zpeak = mbt.find_zerone_peak(chip.zerone[0],chip.zerone[1],peak)
            if j==0 :
                # Control reads and peak size are taken from the first experiment
                # only: they are the same for all the experiments. `peaksize` is
                # then reused for j>0, which relies on j==0 being visited first.
                peaksize = float(zpeak[-1]['end']-zpeak[0]['start'])
                ctrl_reads = sum(r['read_1'] for r in zpeak)
                table[i,0] = ctrl_reads/peaksize
            chip_reads = sum(r['read_2'] for r in zpeak)
            table[i,j+1] = chip_reads/peaksize
    peaktable[condition.name] = table

In [None]:
# one color per peak set; indexed in parallel with `conditions`, so this list
# needs at least as many entries as there are conditions
colors = ['xkcd:blue',
          'xkcd:purple',
          'xkcd:black',
          'xkcd:pink',
          'xkcd:orange']
# points of the reference diagonal (ChIP reads/bp equal to input reads/bp)
x = np.arange(0,0.5,0.1)
y = x.copy()
# one figure per ChIP experiment (column j+1 of every peak table) ...
for j,chip in enumerate(conditions) :
    fig = plt.figure(figsize=(7,7))
    # ... showing the peaks of every condition, each in its own color
    for i,condition in enumerate(conditions) :
        plt.scatter(peaktable[condition.name][:,0],peaktable[condition.name][:,j+1],
                        color=colors[i],label=condition.name)
    # The diagonal does not depend on the peak set, so it is drawn once per
    # figure instead of once per inner-loop iteration.
    plt.plot(x,y,'k--',linewidth=0.75)
    plt.xlabel('Number of reads/bp in Input')
    plt.ylabel('Number of reads/bp in ChIP')
    plt.legend(loc='upper right')
    plt.title('%s nM Progesterone'%chip.concentration,fontsize=32)
    plt.show()

These figures are interesting because we see that the more hormone is added to the system, the more ChIP reads are produced relative to input reads. The 'low' peaks, for example, have a trend of being constantly under the dashed line (number of ChIP reads equals number of input reads) until the maximum progesterone concentration is reached.

One thing that I can do now is to calculate the slope of these curves, and compare the numbers across conditions and experiments.

In [None]:
# names and concentrations of the conditions, in the order of `conditions`
names = [condition.name for condition in conditions]
concentrations = [condition.concentration for condition in conditions]
# slopetable[i,j-1]: fit of ChIP reads/bp (column j of the peak table) against
# control reads/bp (column 0) for the peaks of condition i
slopetable = np.zeros((len(conditions),len(conditions)))
for i,name in enumerate(names) :
    # the control column is the same for every ChIP column: read it once
    ctrl_reads = peaktable[name][:,0]
    # `range` rather than `xrange`, so the cell runs on Python 2 and Python 3
    for j in range(1,len(conditions)+1) :
        chip_reads = peaktable[name][:,j]
        # NOTE(review): element [1] of the linear_fit result is taken to be the
        # slope -- confirm against mbt.linear_fit's return order
        slopetable[i,j-1] = mbt.linear_fit(ctrl_reads,chip_reads)[1]

In [None]:
# One curve per condition: row i of slopetable plotted against the hormone
# concentrations, on log-log axes.
fig = plt.figure(figsize=(8,6))
# enumerate replaces the Python-2-only xrange index loop; same curves, same order
for i,condition in enumerate(conditions) :
    plt.loglog(concentrations,slopetable[i,:],'o--',label=condition.name)
plt.legend(loc='upper left')
plt.xlabel('Concentration [nM]')
plt.ylabel('ChIP reads/bp per input reads/bp')
plt.show()

Let's save this data for future reference.

In [None]:
# write one .npy file per condition containing its peak table ...
for condition in conditions :
    counts_file = '%s/%s-read_counts.npy'%(data_dir,condition.name)
    np.save(counts_file,peaktable[condition.name])
# ... and a single file with the slope table, all inside data_dir
slope_file = '%s/slope_table.npy'%data_dir
np.save(slope_file,slopetable)