In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, sys
import pr_peaks as pp
import mybiotools as mbt
import pysam

# 2019-07-26 New new data
Roser sent me the data from the new round of experiments (P3679). Here I look at what these data files look like after mapping them (BWA, hg38 genome).

In [None]:
# Directory layout: the project root is resolved from the HOME environment
# variable, and the ChIP-seq data directory lives below it.
pp_root = '%s/work/CRG/projects/pr_peaks' % (os.environ.get('HOME'),)
data_dir = '%s/data/chipseq' % (pp_root,)

In [None]:
def chipseq_bam_location (sample_id, datadir) :
    """
    Locate the BAM file of a ChIP-seq sample.

    The directory `<datadir>/chipseq/samples/<sample_id>/alignments` is walked
    recursively and every file whose name ends with ".bam" is a candidate.
    A candidate whose path contains "with_control" is preferred; if there is
    none, the last candidate in sorted order is returned.

    Parameters
    ----------
    sample_id : str
        Name of the sample sub-directory.
    datadir : str
        Root directory that contains the `chipseq/samples` tree.

    Returns
    -------
    str or None
        Path of the selected BAM file, or None (after issuing a warning) when
        no ".bam" file is found.
    """
    # local import: keeps this block self-contained within the notebook
    import warnings
    # build the directory name where the files are; use the `datadir`
    # argument rather than a global defined outside this function
    d = "%s/chipseq/samples/%s/alignments"%(datadir,sample_id)
    # collect all files that end with ".bam" below that directory
    bamfiles = []
    for root,sub,files in os.walk(d) :
        for f in files :
            if f.endswith (".bam") :
                bamfiles.append('%s/%s'%(root,f))
    # os.walk yields entries in arbitrary order: sort so that the fallback
    # choice is reproducible from run to run
    bamfiles.sort()
    # prefer the file that lives in a path containing "with_control";
    # otherwise keep the last candidate seen
    fin = None
    for bamfile in bamfiles :
        fin = bamfile
        if 'with_control' in bamfile :
            break
    if fin is None :
        warnings.warn('chipseq_bam_location: Data not found for %s'%sample_id)
    return fin

In [None]:
class ChIPseq :
    """
    Thin wrapper around a BAM file of mapped ChIP-seq reads.

    Attributes
    ----------
    bamfile : str
        Path of the BAM file passed to the constructor.
    bam : pysam.AlignmentFile
        Open pysam handle on `bamfile`.
    """
    
    def __init__(self, bamfile) :
        """Open `bamfile` with pysam and keep the handle in `self.bam`."""
        self.bamfile = bamfile
        # init the pysam parser
        self.bam = pysam.AlignmentFile(self.bamfile)
        
    def peak_counts(self, chromosome, start, end, extend=None) :
        """
        Return the value of `self.bam.count` for a genomic interval.

        Parameters
        ----------
        chromosome : str or int
            Reference name; converted to `str` before querying.
        start, end : int or float
            Interval boundaries. They are truncated to integers, because
            genomic coordinates are integral and callers may compute them
            with float arithmetic.
        extend : int, optional
            If given, the interval is widened by this amount on both sides.

        Returns
        -------
        Whatever `self.bam.count(chromosome, start, end)` returns
        (NOTE(review): presumably the number of reads overlapping the
        interval, which requires an indexed BAM - confirm with pysam docs).
        """
        if extend is not None :
            start -= extend
            end += extend
        chromosome = str(chromosome)
        # coordinates must be integers; a widened window must not start
        # before the beginning of the chromosome
        start = max(0, int(start))
        end = int(end)
        return self.bam.count(chromosome,start,end)

    def close(self) :
        """Close the underlying BAM handle."""
        self.bam.close()

In [None]:
# sample table: a text file with one sample per line, made of two
# tab-separated fields (BAM file name, sample name)
sample_table_fname = '%s/list.txt'%(data_dir)

# parse it: build a dictionary that maps each sample name to its ChIPseq object
experiments = {}
with open(sample_table_fname, 'r') as f :
    for line in f :
        line = line.strip()
        # skip blank lines (e.g. a trailing empty line), which cannot be
        # unpacked into the two expected fields
        if not line :
            continue
        sample_fname, sample_name = line.split('\t')
        bamfile = '%s/%s'%(data_dir, sample_fname)
        experiments[sample_name] = ChIPseq(bamfile)

With this, we loaded all our experiments in a data structure that will allow for relatively easy access to the information we need. Let's now load the information on the peaks that we will analyze.

In [None]:
# The three conditions used in this analysis. The meaning of the positional
# arguments is defined by pr_peaks.Condition.
high = pp.Condition(
    'high', 'all_treated', 0.05, 'gv_107_01_01_chipseq'
)
medium = pp.Condition(
    'medium', '3HCP', 0.50, 'gv_109_01_01_chipseq'
)
low = pp.Condition(
    'low', '1HCP', 10.0, 'gv_111_01_01_chipseq'
)

In [None]:
# shorthand handles on the `peaks` attribute of the high and low conditions
Hpeaks, Lpeaks = high.peaks, low.peaks

## Reproducibility of old results

Let's take this step by step. First, let's figure out whether the peaks at 0.05 nM in the old sample coincide with those in the new samples.

In [None]:
# the old 0.05 sample and the two new 0.05 samples
old = experiments['0.05old']
new1 = experiments['0.05new1']
new2 = experiments['0.05new2']
samples = (old, new1, new2)

# region to scan and sliding-window parameters
chromosome = 'chr2'
begin = 11470000
terminate = begin + 300000
window_size = 300
step_size = 150
# integer half-width, so that the window boundaries stay integer coordinates
half_window = window_size // 2

# X: window centers; Y: one column of counts per sample (old, new1, new2)
X = np.arange(begin, terminate, step_size)
Y = np.zeros((X.shape[0], len(samples)))
for i,x in enumerate(X) :
    # plain Python ints rather than numpy scalars for the query boundaries
    window_start = int(x) - half_window
    window_end = int(x) + half_window
    for j,sample in enumerate(samples) :
        Y[i,j] = sample.peak_counts(chromosome, window_start, window_end)

In [None]:
# one stacked panel per sample, in the same column order as Y
panel_labels = ("Old", "New1", "New2")
fig, axes = plt.subplots(len(panel_labels), 1, figsize=(10,6))

bottom_panel = len(panel_labels) - 1
for panel,label in enumerate(panel_labels) :
    ax = axes[panel]
    # only the bottom panel shows the x axis and its label
    is_bottom = (panel == bottom_panel)
    mbt.line_plot(ax, X, Y[:,panel], show_xaxis=is_bottom)
    if is_bottom :
        ax.set_xlabel("Genomic coordinate [%s]"%(chromosome))
    ax.set_ylabel(label)

plt.show()

At the well-known GREB1 locus, the old and new samples do not correspond. I stop here for now, pending help from the bioinformaticians.