In [None]:
# draw matplotlib figures inline in the notebook output
%matplotlib inline
# load IPython's autoreload extension; with mode 2 imported modules are
# re-imported before each cell runs, so edits to the local helper modules
# are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import os
import pr_peaks
from matplotlib_venn import venn2, venn2_circles, venn3, venn3_circles

# 2018-03-08 Comparing Zerone outputs
Here I want to see what happens if I compare the outputs of Zerone when it is given the same ChIP-seq data to process, but with different input (control) samples.

There are three input files:

- `T0_roberto_input`: is the one that François told me to use
- `T0_total_input`: is the one that Roberto uses for the T0 condition
- `R1h_total_input`: is the one that Roberto uses for the R1h condition (one hour after hormone stimulation)

In the ChIP-seq experiments of Guille, the data is collected at 30 minutes after hormone stimulation, so in principle it is not super-clear which input should be used.

Let's start by loading the Zerone outputs of the `high` experiment, using the three different input files.

In [None]:
# prepare the list of chromosomes: chr1..chr22 followed by chrX, stored as a
# tuple. `range` replaces the Python-2-only `xrange` so that this cell also
# runs on Python 3; the resulting tuple is the same on both.
chromosomes = tuple(['chr%d'%i for i in range(1,23)] + ['chrX'])

In [None]:
# load the ChIP-seq data: one tuple of positional arguments per condition,
# passed to pr_peaks.Condition in the order in which they are listed
condition_specs = [
    ('high'   ,'all_treated',0.05,'gv_107_01_01_chipseq'),
    ('medium1','4HCP'       ,0.10,'gv_108_01_01_chipseq'),
    ('medium2','3HCP'       ,0.50,'gv_109_01_01_chipseq'),
    ('medium3','3HCP'       ,1.00,'gv_110_01_01_chipseq'),
    ('low'    ,'1HCP'       ,10.0,'gv_111_01_01_chipseq'),
]
conditions = [pr_peaks.Condition(*spec) for spec in condition_specs]

In [None]:
# general variables
# project root, located under the directory named by the HOME environment variable
home_dir = os.getenv('HOME')
pr_peaks_root_dir = '{}/work/CRG/projects/pr_peaks'.format(home_dir)
data_dir = '{}/data'.format(pr_peaks_root_dir)
# names of the three input samples that are compared in this notebook
in_bams = ['T0_roberto_input','T0_total_input','R1h_total_input']

In [None]:
# parse the Zerone output files for all the conditions and all the inputs;
# every condition gets a `zerone` dictionary keyed by the input name
for condition in conditions :
    condition.zerone = parsed_by_input = {}
    for in_bam in in_bams :
        mbt.log_message('parse_zerone_output','Parsing %s/%s'%(condition.name,in_bam))
        # files are laid out as <data_dir>/<input name>/<condition name>-zerone.out
        zerone_path = '{}/{}/{}-zerone.out'.format(data_dir,in_bam,condition.name)
        parsed_by_input[in_bam] = mbt.parse_zerone_output(zerone_path,
                                                          chromosome_list=chromosomes)

I can do a simple thing: calculate the correlation between the various samples. Let's start with the controls: I can choose any of the conditions, because the control values are the same.

In [None]:
# pairwise correlation of the 'read_1' column (treated as the control signal in
# the surrounding text) between the Zerone outputs obtained with each input
high = conditions[0]
n_in_bam = len(in_bams)
control_reads = [high.zerone[in_bam][0]['read_1'] for in_bam in in_bams]
controls_corrmat = np.zeros((n_in_bam,n_in_bam))
for row,reads_row in enumerate(control_reads) :
    for col,reads_col in enumerate(control_reads) :
        # np.corrcoef returns a 2x2 matrix: the off-diagonal entry is the
        # correlation between the two series
        controls_corrmat[row,col] = np.corrcoef(reads_row,reads_col)[0,1]

In [None]:
# plot the correlation matrix; rows and columns are labelled with the input
# names (same order as `in_bams`) so that the figure can be read on its own
cax = plt.matshow(controls_corrmat)
tick_positions = np.arange(len(in_bams))
plt.xticks(tick_positions,in_bams,rotation=90)
plt.yticks(tick_positions,in_bams)
plt.colorbar(cax,label='correlation coefficient')
plt.show()

So the controls are highly correlated with each other. Let's now look at the correlation between the p values of the various Zerone outputs, for each condition.

In [None]:
# for every condition, pairwise correlation of the 'p' column between the
# Zerone outputs obtained with the different inputs
for condition in conditions :
    condition.zerone_corrmat = corrmat = np.zeros((n_in_bam,n_in_bam))
    p_columns = [condition.zerone[in_bam][0]['p'] for in_bam in in_bams]
    for row,p_row in enumerate(p_columns) :
        for col,p_col in enumerate(p_columns) :
            # off-diagonal entry of the 2x2 matrix returned by np.corrcoef
            corrmat[row,col] = np.corrcoef(p_row,p_col)[0,1]

In [None]:
# one correlation-matrix plot per condition
for condition in conditions :
    # plt.matshow opens its own figure when no `fignum` is passed, so no
    # plt.figure() call is made before it: that call only left an extra,
    # empty figure behind for every condition
    cax = plt.matshow(condition.zerone_corrmat)
    plt.colorbar(cax)
    plt.title(condition.name)
    plt.show()

Let's also have a visual look at a particular region of one of the controls.

In [None]:
# overlay the 'read_1' values of the first condition, one line per input,
# restricted to the rows x1..x2-1 of the parsed Zerone output
x1,x2 = 10000,11000
x = np.arange(x1,x2)
first_condition = conditions[0]
for in_bam in in_bams :
    read_1 = first_condition.zerone[in_bam][0]['read_1']
    plt.plot(x,read_1[x1:x2],label=in_bam)
plt.legend()
plt.show()

This is further evidence that the number of reads in the controls is not dramatically different between the various "input" samples. Also, the difference between the control condition at T0 and at R1h is not spectacular.

Now let's see whether using the different controls leads to any major differences in the peak calling. I'll do this by making a table that is structured as follows: each row corresponds to a peak in a given condition, and each column records whether Zerone called that a peak or not when using one of the three different inputs.

In [None]:
# build, for every condition, a boolean table with one row per peak and one
# column per input: True when the Zerone windows found for that peak are not
# all marked with enrichment 0
for condition in conditions :
    # NOTE(review): condition.peaks is not assigned in any of the visible cells;
    # presumably it is provided by pr_peaks.Condition -- confirm
    called = np.zeros((condition.peaks.size,n_in_bam),dtype=bool)
    condition.peak_table = called
    for i,peak in enumerate(condition.peaks):
        for j,in_bam in enumerate(in_bams) :
            zerone = condition.zerone[in_bam]
            p = mbt.find_zerone_peak(zerone[0],zerone[1],peak)
            # equivalent to: not every 'enrichment' value equals 0
            if (p['enrichment']!=0).any() :
                called[i,j] = True

In [None]:
# one three-set Venn diagram per condition: each set holds the row indices of
# peak_table that are True for one of the three inputs
for condition in conditions :
    peak_sets = [set(np.flatnonzero(condition.peak_table[:,k]).tolist())
                 for k in range(3)]
    set_labels = tuple(in_bams[k] for k in range(3))
    plt.figure()
    venn3(peak_sets,set_labels)
    plt.title(condition.name)
    plt.show()

Okay so this shows that the peaks called by Zerone are not that different from the ones called by Xavi, and the three different inputs do not give very different results.