In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import os
import pr_peaks
import zerone
from scipy.stats import gaussian_kde

In [None]:
# Build the list of ChIP-seq conditions to analyse. Each tuple below holds the
# four positional arguments handed, unchanged and in order, to
# pr_peaks.Condition.
# NOTE(review): the meaning of each field is defined by pr_peaks.Condition;
# later cells only rely on the resulting `.name`, `.peaks` and `.peak_code`
# attributes -- confirm the field semantics against that module.
conditions = [
    pr_peaks.Condition(*spec)
    for spec in [
        ('high'   , 'all_treated', 0.05, 'gv_107_01_01_chipseq'),
        ('medium1', '4HCP'       , 0.10, 'gv_108_01_01_chipseq'),
        ('medium2', '3HCP'       , 0.50, 'gv_109_01_01_chipseq'),
        ('low'    , '1HCP'       , 10.0, 'gv_111_01_01_chipseq'),
    ]
]

# 2018-04-13 Zerone analysis of histone modifications
I saw that the number of reads in the histone mark analysis does not differ very much, although I did not do the statistical test properly.

The very last thing I want to look at is whether Zerone agrees with this view: that the enrichment around H peaks and L peaks for the different histone modifications is not significantly different.

The Zerone output files are quite heavy. Therefore, I'll do the analysis on the cluster and load the data here.

In [None]:
# Sample identifier of each histone-mark ChIP-seq experiment, keyed by
# '<mark>-<time>'. Listed as (key, sample id) pairs, one mark per pair of rows.
experiments = dict([
    ('H3K14ac-T0',   'gv_037_01_01_chipseq'),
    ('H3K14ac-T30',  'gv_038_01_01_chipseq'),
    ('H3K27ac-T0',   'gv_039_01_01_chipseq'),
    ('H3K27ac-T30',  'gv_040_01_01_chipseq'),
    ('H3K27me3-T0',  'gv_041_01_01_chipseq'),
    ('H3K27me3-T30', 'gv_042_01_01_chipseq'),
    ('H3K36me2-T0',  'gv_043_01_01_chipseq'),
    ('H3K36me2-T30', 'gv_044_01_01_chipseq'),
    ('H3K4me1-T0',   'gv_045_01_01_chipseq'),
    ('H3K4me1-T30',  'gv_046_01_01_chipseq'),
    ('H3K4me3-T0',   'gv_047_01_01_chipseq'),
    ('H3K4me3-T30',  'gv_048_01_01_chipseq'),
    ('H3K9me3-T0',   'gv_049_01_01_chipseq'),
    ('H3K9me3-T30',  'gv_050_01_01_chipseq'),
])

In [None]:
times = ['T0', 'T30']
marks = ['H3K14ac', 'H3K27ac', 'H3K27me3', 'H3K36me2', 'H3K4me1', 'H3K4me3', 'H3K9me3']
ncols = len(times) * len(marks)

# Column index of every '<mark>-<time>' experiment: marks vary slowest and
# times fastest, so the columns of one mark are adjacent (T0 first, then T30).
exp_table = {
    name: column
    for column, name in enumerate(
        '%s-%s' % (mark, time) for mark in marks for time in times
    )
}

Prepare the paths to the data directory from which the processed Zerone output will be loaded.

In [None]:
# Locate the project data under the user's home directory.
# os.path.expanduser('~') is used instead of os.getenv('HOME'): the latter
# returns None when HOME is unset, which '%s' formatting would silently turn
# into the bogus relative path 'None/work/...'.
pr_peaks_root_dir = os.path.join(
    os.path.expanduser('~'), 'work', 'CRG', 'projects', 'pr_peaks'
)
data_dir = os.path.join(pr_peaks_root_dir, 'data')
# name of the data subdirectory that holds the per-sample .npy files
inp = 'T0_roberto_input'
chipseq_datadir = os.path.join(data_dir, inp)

Now we are ready to load the columns of the table. The entries of the `zerone_table` will be the values of a relative enrichment of the peak: that is, the fraction of enriched peaks in the region. This will later help me evaluate whether there are significant differences between the enrichment values before and after hormone treatment.

In [None]:
# Build one table per condition: one row per peak and one column per
# '<mark>-<time>' experiment, filled from the .npy files in chipseq_datadir.
zerone_table = {}
for condition in conditions:
    n_peaks = condition.peaks.size
    peak_code = condition.peak_code
    enrichment = np.zeros((n_peaks, len(times) * len(marks)))
    for mark in marks:
        for time in times:
            exp_name = '%s-%s' % (mark, time)
            # file is named after the sample id and the condition's peak code
            npy_path = '%s/%s-%s.npy' % (chipseq_datadir, experiments[exp_name], peak_code)
            # presumably each file stores one value per peak of this
            # condition -- the assignment below requires a compatible shape
            enrichment[:, exp_table[exp_name]] = np.load(npy_path)
    zerone_table[condition.name] = enrichment

We can now perform an analysis of the results. I'll make a table of gaussian kernels representing the shift in values of relative enrichment. That is, if I have the columns of the `zerone_table` that represent the relative enrichment of each peak, taking the difference between the T30 condition and the T0 condition will produce a dataset of which I can study the distribution. If the distribution is peaked towards positive values, then there is a gain in that particular mark. If it is peaked around zero, nothing happens. If it is peaked around minus one, then there is a loss of that mark.

In [None]:
# For every condition, estimate with a Gaussian kernel the distribution of the
# per-peak change (T30 column minus T0 column) of each histone mark.
k_diff_table = {}
for condition in conditions :
    table = zerone_table[condition.name]
    k_dtable = {}
    # Columns come in adjacent (T0, T30) pairs, one pair per entry of `marks`.
    # `range` and floor division `//` give the same result on Python 2 and 3,
    # whereas `xrange` and `/` only worked on Python 2.
    for j in range(table.shape[1] // 2) :
        k_dtable[marks[j]] = gaussian_kde(table[:,2*j+1]-table[:,2*j])
    k_diff_table[condition.name] = k_dtable

Now let's represent all the marks.

In [None]:
# Evaluation grid for the kernel density estimates, from -1 to +1 in steps of
# 0.01. np.linspace is used instead of np.arange with a float step so that the
# number of points is exact and the right endpoint (+1) is included.
x = np.linspace(-1, 1, 201)
for mark in marks :
    # one figure per mark, with one curve per condition
    fig, ax = plt.subplots()
    for condition in conditions :
        k_dtable = k_diff_table[condition.name]
        ax.plot(x, k_dtable[mark](x), label=condition.name, linewidth=3)
    ax.legend(loc='upper right')
    ax.set_title(mark, fontsize=32)
    ax.set_xlabel('Differential', fontsize=24)
    ax.set_ylabel('Distribution', fontsize=24)