In [None]:
# Render matplotlib figures inline, in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload imported modules before each
# execution, so that edits to local modules are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, sys
import pr_peaks as pp
import mybiotools as mbt
import pysam

# 2019-07-26 New new data
Roser sent me the data of the new round of experiments (P3679). Here I look at what these data files look like after mapping them (BWA, hg38 genome).

In [None]:
# Project root (below the directory given by $HOME) and the folder that holds
# the ChIP-seq data of this project.
pp_root = '{}/work/CRG/projects/pr_peaks'.format(os.getenv('HOME'))
data_dir = '{}/data/chipseq'.format(pp_root)

In [None]:
def chipseq_bam_location (sample_id, datadir) :
    """
    Locate the BAM alignment file of a ChIP-seq sample.

    The directory `<datadir>/chipseq/samples/<sample_id>/alignments` is walked
    recursively and every file whose name ends with ".bam" is collected. The
    first collected file whose path contains "with_control" is preferred; if
    there is none, the last collected BAM file is returned.

    Parameters
    ----------
    sample_id : str
        Identifier of the sample (name of its directory under `samples`).
    datadir : str
        Root directory of the data.

    Returns
    -------
    str or None
        Path of the selected BAM file, or None (after emitting a warning) if
        no BAM file was found.
    """
    import warnings
    # build the directory name where the files are
    d = "%s/chipseq/samples/%s/alignments"%(datadir,sample_id)
    # collect all files that end with ".bam" below that directory
    bamfiles = ['%s/%s'%(root,f)
                for root,sub,files in os.walk(d)
                for f in files
                if f.endswith (".bam")]
    if not bamfiles :
        warnings.warn('chipseq_bam_location: Data not found for %s'%sample_id)
        return None
    # prefer the file that lives in a "with_control" location
    for bamfile in bamfiles :
        if 'with_control' in bamfile :
            return bamfile
    return bamfiles[-1]

In [None]:
class ChIPseq :
    """
    Handle on the alignment (BAM) file of one ChIP-seq experiment.

    Attributes
    ----------
    bamfile : str
        Path of the alignment file, as passed to the constructor.
    bam : pysam.AlignmentFile
        Parser opened on `bamfile`.
    """

    def __init__(self, bamfile) :
        self.bamfile = bamfile
        # init the pysam parser
        self.bam = pysam.AlignmentFile(self.bamfile)

    def peak_counts(self, chromosome, start, end, extend=None) :
        """
        Return `self.bam.count(...)` for a genomic interval.

        Parameters
        ----------
        chromosome : str or object
            Chromosome name; it is converted with `str()`.
        start, end : number
            Interval boundaries. They are converted to `int`, and `start` is
            clamped to 0, so that float coordinates or windows that reach
            past the beginning of the chromosome do not reach the parser.
        extend : number, optional
            If given, the interval is widened by this amount on both sides
            before counting.

        Returns
        -------
        The value returned by `self.bam.count(chromosome, start, end)`.
        """
        if extend is not None :
            start -= extend
            end += extend
        # the parser is handed plain, non-negative integer coordinates
        start = max(0, int(start))
        end = int(end)
        return self.bam.count(str(chromosome),start,end)

In [None]:
# sample table: each data line is "<file name><TAB><sample name>"
sample_table_fname = '%s/list.txt'%(data_dir)

# experiments keyed by sample name; a sample goes to the "old" and/or "new"
# group depending on which of those words its name contains
old_experiments = {}
new_experiments = {}

with open(sample_table_fname, 'r') as f :

    # parse file line by line
    for line in f :

        line = line.strip()
        # skip blank lines (they cannot be unpacked below) and comments
        if not line or line.startswith('#') : continue
        sample_fname, sample_name = line.split('\t')
        bamfile = '%s/%s'%(data_dir, sample_fname)
        if 'old' in sample_name :
            old_experiments[sample_name] = ChIPseq(bamfile)
        if 'new' in sample_name :
            new_experiments[sample_name] = ChIPseq(bamfile)

With this, we loaded all our experiments in a data structure that will allow for relatively easy access to the information we need. Let's now load the information on the peaks that we will analyze.

In [None]:
# The three conditions compared in this notebook.
# NOTE(review): the meaning of the positional arguments is defined by
# pp.Condition; the last one looks like a ChIP-seq sample id -- confirm.
high = pp.Condition('high', 'all_treated', 0.05, 'gv_107_01_01_chipseq')
medium = pp.Condition('medium', '3HCP', 0.50, 'gv_109_01_01_chipseq')
low = pp.Condition('low', '1HCP', 10.0, 'gv_111_01_01_chipseq')

In [None]:
# Peaks of the high (H) and low (L) conditions.
Hpeaks, Lpeaks = high.peaks, low.peaks

## Reproducibility of old results

Let's take this step by step. First, let's see whether, at the different concentrations, we obtain correspondence at a well-characterized locus: GREB1.

In [None]:
def _window_counts(experiments, chromosome, positions, half_window) :
    """Call `peak_counts` of every experiment (columns, sorted by key) on a window around each position (rows)."""
    keys = sorted(experiments.keys())
    counts = np.zeros((len(positions), len(keys)))
    for i, x in enumerate(positions) :
        # windows never start before the beginning of the chromosome
        lo = max(0, x - half_window)
        hi = x + half_window
        for j, key in enumerate(keys) :
            counts[i,j] = experiments[key].peak_counts(chromosome, lo, hi)
    return counts

def get_old_and_new(old_experiments, new_experiments, chromosome, start, end,
                    window_size=300, step_size=150, span=100000) :
    """
    Sliding-window counts around a locus for the old and the new experiments.

    Window centers run from `start - span` (clamped to 0) to `end + span`
    (excluded) in steps of `step_size`. For each center `x`, every experiment
    is queried with `peak_counts(chromosome, x - window_size//2,
    x + window_size//2)`, using plain integer coordinates.

    Parameters
    ----------
    old_experiments, new_experiments : dict
        Map experiment name -> object exposing
        `peak_counts(chromosome, start, end)`.
    chromosome : str
        Chromosome of the locus.
    start, end : int
        Boundaries of the locus.
    window_size : int
        Full width of the counting window.
    step_size : int
        Distance between consecutive window centers.
    span : int
        Flanking region added on both sides of the locus.

    Returns
    -------
    X : numpy.ndarray
        Window centers.
    old, new : numpy.ndarray
        Arrays of shape (len(X), number of experiments); the columns follow
        the sorted experiment names.
    """
    # init data
    begin = max(0, start - span)
    terminate = end + span
    X = np.arange(begin, terminate, step_size)
    positions = [int(x) for x in X]
    half_window = int(window_size) // 2

    old = _window_counts(old_experiments, chromosome, positions, half_window)
    new = _window_counts(new_experiments, chromosome, positions, half_window)

    return X, old, new

In [None]:
def plot_tracks(x, experiments, names, chromosome, color='k') :
    """
    Plot one track per experiment, stacked vertically.

    Parameters
    ----------
    x : array-like
        Genomic coordinates (shared horizontal axis values).
    experiments : numpy.ndarray
        2D array; column `i` holds the values of experiment `i`.
    names : sequence of str
        Label of each experiment, indexed like the columns of `experiments`.
    chromosome : str
        Chromosome name, shown in the label of the bottom horizontal axis.
    color : str
        Color passed to `mbt.line_plot`.

    Returns
    -------
    fig : matplotlib Figure
    axes : numpy.ndarray
        1D array of Axes, one per experiment (also when there is only one).
    """
    nexperiments = experiments.shape[1]
    fig, axes = plt.subplots(nexperiments, 1, figsize=(10,nexperiments*1.5))
    # with a single row `plt.subplots` returns a bare Axes: always expose a
    # 1D array so that `axes[i]` works here and in the callers
    axes = np.atleast_1d(axes)
    for i, ax in enumerate(axes) :
        # only the bottom track shows its x axis
        mbt.line_plot(ax, x, experiments[:,i], show_xaxis= i==nexperiments-1, color=color)
        ax.set_ylabel(names[i], fontsize=12)
    axes[-1].set_xlabel("Genomic coordinate [%s]"%(chromosome))
    return fig, axes

In [None]:
# Coordinates of the GREB1 locus, and the old/new count profiles around it.
chromosome, start, end = 'chr2', 11481675, 11642788
X, old, new = get_old_and_new(old_experiments, new_experiments,
                              chromosome, start, end)

In [None]:
# Experiment names in sorted order: get_old_and_new sorts the keys of each
# dictionary to order the columns of `old` and `new`, so the labels must be
# sorted the same way.
old_keys = sorted(old_experiments)
new_keys = sorted(new_experiments)

fig, axes = plot_tracks(X, old, old_keys, chromosome, color='b')
axes[0].set_title("Old experiments", fontsize=24)
plt.show()

fig, axes = plot_tracks(X, new, new_keys, chromosome, color='r')
axes[0].set_title("New experiments", fontsize=24)
plt.show()

In the famous GREB1 locus, there is no peak until 0.1 nM hormone, after which the peak appears.

Let's see what happens at a few other random H peaks.

In [None]:
# An arbitrary H peak; its chromosome name is stored as bytes and is decoded
# to text before being used.
chromosome, start, end = Hpeaks[24]
chromosome = chromosome.decode('utf-8')

X, old, new = get_old_and_new(old_experiments, new_experiments, chromosome, start, end)

# Labels in sorted order, matching the column order used by get_old_and_new.
old_keys = sorted(old_experiments)
new_keys = sorted(new_experiments)

fig, axes = plot_tracks(X, old, old_keys, chromosome, color='b')
axes[0].set_title("Old experiments", fontsize=24)
plt.show()

fig, axes = plot_tracks(X, new, new_keys, chromosome, color='r')
axes[0].set_title("New experiments", fontsize=24)
plt.show()

Playing around with random peaks shows me that some peaks appear and others don't. Let's do a quantitative analysis of the correspondence.

In [None]:
# Names (decoded to text) of the distinct chromosomes that carry H peaks.
chromosomes = list(map(lambda c: c.decode('utf-8'), np.unique(Hpeaks['chr'])))

In [None]:
def make_zerone_output_index(a,chromosome_list=None) :
    """
    Index the rows of a parsed Zerone output by chromosome.

    The input is not sorted here: rows of the same chromosome are expected to
    be contiguous. If a chromosome shows up in several separate runs, the last
    run wins.

    Parameters
    ----------
    a : numpy structured array
        Parsed Zerone output with a 'chr' field.
    chromosome_list : optional
        Accepted for interface compatibility; it is not used to build the
        index.

    Returns
    -------
    dict
        Maps each chromosome name (as stored in `a['chr']`) to the tuple
        `(first_row, last_row)` of its run, both inclusive. Empty input gives
        an empty dictionary.
    """
    a = np.atleast_1d(a)
    c_idx = {}
    if len(a) == 0 :
        return c_idx
    chrs = a['chr']
    # a run starts at row 0 and at every row that differs from the previous one
    run_starts = np.concatenate(([0], np.flatnonzero(chrs[1:] != chrs[:-1]) + 1))
    # a run ends right before the next one starts; the last ends with the array
    run_ends = np.concatenate((run_starts[1:] - 1, [len(chrs) - 1]))
    for c_start, c_end in zip(run_starts.tolist(), run_ends.tolist()) :
        c_idx[chrs[c_start]] = (c_start, c_end)
    return c_idx

def parse_zerone_output(fname,chromosome_list=None) :
    """
    Parses a Zerone output and returns a numpy array. The fields of the numpy
    array are: chr, start, end, enrichment, control, read_0, read_1, ...,
    read_n, p. The number of `read_i` columns depends on the invocation of
    Zerone and cannot be known beforehand: it is deduced from the first data
    line of the file.

    Parameters
    ----------
    fname : str
        Path of the Zerone output file.
    chromosome_list : iterable of str or bytes, optional
        If given, only the rows of these chromosomes are kept. Text names are
        encoded to bytes, because the 'chr' field is stored as bytes.

    Returns
    -------
    a : numpy structured array (always 1D)
    c_idx : dict
        Index built by `make_zerone_output_index` (empty if `a` is empty).

    Raises
    ------
    ValueError
        If the file has no data line (it is empty or holds only comments).
    """
    # first, we start by reading the first non-blank, non-comment line in the
    # Zerone file, to determine the number of `read` columns in the file
    n_fields = None
    with open(fname,'r') as f :
        for line in f :
            if line.strip() and not line.startswith('#') :
                n_fields = len(line.split())
                break
    if n_fields is None :
        raise ValueError('parse_zerone_output: no data lines found in %s'%(fname))
    # six columns are fixed: chr, start, end, enrichment, control and p
    n_readcols = n_fields-6
    zerone_dtype = [('chr','S256'),
                    ('start',np.int64),
                    ('end',np.int64),
                    ('enrichment',np.int32),
                    ('control',np.int64)]
    zerone_dtype.extend(('read_%d'%(i),np.int64) for i in range(n_readcols))
    zerone_dtype.append(('p',float))
    # now we parse the file using the `genfromtxt` function from numpy; a file
    # with a single data line gives a 0-d array, hence `atleast_1d`
    a = np.atleast_1d(np.genfromtxt(fname,dtype=np.dtype(zerone_dtype)))
    # next, we exclude the values of the array that pertain to chromosomes that are not
    # included in the chromosome list that was passed by the user (if any)
    if chromosome_list is not None :
        wanted = [c.encode('utf-8') if isinstance(c, str) else c
                  for c in chromosome_list]
        # a boolean mask keeps the structured dtype even when nothing matches
        a = a[np.isin(a['chr'], np.array(wanted, dtype=a['chr'].dtype))]
    print(a.shape)
    # now pass the array to the index maker, and return the array along with the index
    c_idx = make_zerone_output_index(a,chromosome_list) if len(a) > 0 else {}
    return a,c_idx

def find_zerone_peak(a,c_idx,chromosome, start, end,bin_size=300) :
    """
    Returns the values of the `a` array corresponding to the genomic coordinates
    of the peak. Uses the `c_idx` dictionary to rapidly calculate which are the indices
    of the `a` array that correspond to the peak.

    The rows of each chromosome in `a` are taken to be consecutive bins of
    `bin_size` base pairs starting at coordinate 0, so that a coordinate `x`
    falls in row `x//bin_size` of its chromosome.

    Parameters
    ----------
    a : numpy array
        Parsed Zerone output.
    c_idx : dict
        Maps chromosome name -> (first_row, last_row), both inclusive.
    chromosome : bytes or str
        Chromosome of the peak. A text name that is not a key of `c_idx` is
        looked up in its UTF-8 encoded form.
    start, end : int
        Genomic coordinates of the peak.
    bin_size : int
        Width of a bin.

    Returns
    -------
    numpy array
        Slice of `a` with the bins covered by the peak. It is always an array
        (also when the peak covers a single bin), and it never extends past
        the last row of the chromosome.
    """
    if chromosome not in c_idx and isinstance(chromosome, str) :
        chromosome = chromosome.encode('utf-8')
    c_start,c_end = c_idx[chromosome]
    first_row = c_start + start//bin_size
    # do not spill over into the rows of the next chromosome
    last_row = min(c_start + end//bin_size, c_end)
    return a[first_row:last_row+1]

In [None]:
# Parse the Zerone output that sits next to the BAM file of each new
# experiment: "<name>.bam" -> "<name>.zerone.out".
zerones = {}
for name, experiment in new_experiments.items() :
    mbt.log_message('zerone', name)
    bam_fname = experiment.bamfile
    # remove the ".bam" suffix only: `str.strip('.bam')` would instead eat
    # every '.', 'b', 'a' or 'm' character found at either end of the path
    if bam_fname.endswith('.bam') :
        bam_fname = bam_fname[:-len('.bam')]
    zerone_fname = bam_fname + '.zerone.out'
    # no chromosome list is passed: the rows of all chromosomes are kept
    zerones[name] = parse_zerone_output(zerone_fname)

In [None]:
# Fraction of enriched Zerone bins of every H peak (columns) in every new
# experiment (rows, sorted by experiment name).
nHpeaks = len(Hpeaks)
new_keys = sorted(new_experiments)
Nnew = len(new_keys)
peak_table = np.zeros((Nnew, nHpeaks))
for i, name in enumerate(new_keys) :
    # the parsed Zerone output of this experiment is the same for all peaks
    a, c_idx = zerones[name]
    for j, peak in enumerate(Hpeaks) :
        # NOTE(review): the chromosome name is passed as stored in Hpeaks
        # (not decoded), presumably to match the keys of c_idx -- confirm.
        chromosome, start, end = peak
        zpeak = mbt.find_zerone_peak(a, c_idx, chromosome, start, end)['enrichment']
        peak_table[i, j] = zpeak.sum()/len(zpeak)