In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Load the autoreload extension and reload modules before running code,
# so that edits to imported modules are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import pysam
import os

# 2018-02-19 Number of reads analysis
I want to perform the same analysis as before, but simply looking at the number of reads corresponding to each of the peaks.

The pieces of code below are just copy/pasted from my previous notebook.

In [None]:
def chipseq_bam_location (sample_id,xavi_datadir='/mnt/xavi/data') :
    """Return the path of the BAM alignment file of a ChIP-seq sample.

    The directory `<xavi_datadir>/chipseq/samples/<sample_id>/alignments` is
    searched recursively for files ending in ".bam". A file whose path
    contains "with_control" is preferred; otherwise the last ".bam" file
    encountered is used. If several files qualify, the one selected follows
    the traversal order of `os.walk`.

    Parameters
    ----------
    sample_id : str
        Identifier of the sample (name of its directory).
    xavi_datadir : str
        Root directory of the data tree.

    Returns
    -------
    str or None
        Path of the selected BAM file, or None (after emitting a warning)
        if no ".bam" file exists under the sample directory.
    """
    # imported locally so that this cell does not depend on the import cell
    import warnings
    # build the directory name where the files are
    d = "%s/chipseq/samples/%s/alignments"%(xavi_datadir,sample_id)
    # collect all files that end with ".bam" below that directory
    bamfiles = ['%s/%s'%(root,f)
                for root,sub,files in os.walk(d)
                for f in files if f.endswith(".bam")]
    # prefer the first file whose path contains "with_control"; if there is
    # none, fall back to the last file found
    fin = None
    for bamfile in bamfiles :
        fin = bamfile
        if 'with_control' in bamfile :
            break
    if fin is None :
        # the original called an undefined `warn_message` helper here, which
        # raised NameError; use the standard warnings machinery instead
        warnings.warn('chipseq_bam_location: Data not found for %s'%sample_id)
    return fin

In [None]:
class Condition(object) :
    """One experimental condition: its peak set and its BAM alignments.

    Attributes
    ----------
    name : label of the condition
    peak_code : code passed to `mbt.load_hcp_peaks` to load the peak set
    concentration : concentration associated with the condition
    sample_id : sample identifier used to locate the BAM file
    peaks : peaks returned by `mbt.load_hcp_peaks(peak_code)`
    bam_file : path returned by `chipseq_bam_location(sample_id)`
    bam : open `pysam.AlignmentFile` on `bam_file`
    """
    def __init__(self,name,peak_code,concentration,sample_id) :
        # defined first so that __del__ is safe even if construction fails
        self.bam = None
        self.name = name
        self.peak_code = peak_code
        self.concentration = concentration
        self.sample_id = sample_id
        # load the peaks
        self.peaks = mbt.load_hcp_peaks(self.peak_code)
        # locate the BAM file
        self.bam_file = chipseq_bam_location(sample_id)
        if self.bam_file is None :
            # fail with a clear message instead of handing None to pysam
            raise IOError('No BAM file found for sample %s'%sample_id)
        # init the pysam parser
        self.bam = pysam.AlignmentFile(self.bam_file)
    def peak_counts(self,peak) :
        """Return `self.bam.count(...)` for the region described by `peak`.

        `peak` is unpacked as (chromosome, start, end); the chromosome is
        converted to a string before querying the BAM file.
        """
        chromosome,start,end = peak
        chromosome = str(chromosome)
        # use the BAM parser to count the reads in the region of the peak
        return self.bam.count(chromosome,start,end)
    def __del__(self) :
        # `bam` may be missing or None if __init__ did not complete
        bam = getattr(self,'bam',None)
        if bam is not None :
            bam.close()

In [None]:
def average_peak_counts(peaks,condition) :
    """Average of `condition.peak_counts(peak)` over all `peaks`.

    Parameters
    ----------
    peaks : iterable of peaks
        Each item is passed unchanged to `condition.peak_counts`.
    condition : object with a `peak_counts(peak)` method

    Returns
    -------
    float
        Mean of the per-peak counts (NaN, with a numpy warning, if `peaks`
        is empty).
    """
    # One entry per peak actually iterated. Sizing the buffer with
    # `peaks.size` over-counted for 2-D arrays (it counts elements, not rows)
    # and failed for plain sequences, which have no `.size`.
    pcounts = np.array([condition.peak_counts(peak) for peak in peaks],dtype=float)
    return pcounts.mean()

In [None]:
# Build one "Condition" per sample; each one bundles the peak set and the
# BAM alignments of that sample together with its concentration.
high = Condition(name='high',
                 peak_code='all_treated',
                 concentration=0.05,
                 sample_id='gv_107_01_01_chipseq')
medium1 = Condition(name='medium1',
                    peak_code='4HCP',
                    concentration=0.10,
                    sample_id='gv_108_01_01_chipseq')
medium2 = Condition(name='medium2',
                    peak_code='3HCP',
                    concentration=0.50,
                    sample_id='gv_109_01_01_chipseq')
low = Condition(name='low',
                peak_code='1HCP',
                concentration=10.0,
                sample_id='gv_111_01_01_chipseq')

In [None]:
# For every sample, compute the average read count over each of the four peak
# sets and store it on the sample as avH / avM1 / avM2 / avL. This replaces
# four copy-pasted cells with a single loop; the attributes set and the order
# of the computations are unchanged.
_peak_sets = (('avH',high),('avM1',medium1),('avM2',medium2),('avL',low))
for sample in (high,medium1,medium2,low) :
    for attr,peak_source in _peak_sets :
        # reads of `sample` counted over the peaks called in `peak_source`
        setattr(sample,attr,average_peak_counts(peak_source.peaks,sample))

In [None]:
# Table of average read counts: one row per sample, one column per peak set.
# print(...) with a single pre-formatted string produces the same output under
# Python 2 and Python 3, whereas the bare print statement only runs on Python 2.
row_format = "%-8s:   %.3f    %.3f    %.3f     %.3f"
print("           avH     avM1    avM2     avL")
for label,sample in (("High",high),("Medium1",medium1),("Medium2",medium2),("Low",low)) :
    print(row_format%(label,sample.avH,sample.avM1,sample.avM2,sample.avL))

In [None]:
# All samples, listed in order of increasing concentration (0.05, 0.10, 0.50, 10.0)
conditions = [high,medium1,medium2,low]

In [None]:
# For each sample, express the average read count over the High, Medium1 and
# Medium2 peak sets relative to the average over the Low peak set.
nconditions = len(conditions)
H_to_L = np.array([c.avH/c.avL for c in conditions],dtype=float)
M1_to_L = np.array([c.avM1/c.avL for c in conditions],dtype=float)
M2_to_L = np.array([c.avM2/c.avL for c in conditions],dtype=float)

In [None]:
# Log-log plot of the read-count ratios against the concentration of each
# sample. Axis labels, a title and point markers are added so that the figure
# can be read on its own.
concentrations = [condition.concentration for condition in conditions]
fig,ax = plt.subplots()
ax.loglog(concentrations,H_to_L,marker='o',label='High')
ax.loglog(concentrations,M1_to_L,marker='o',label='Medium1')
ax.loglog(concentrations,M2_to_L,marker='o',label='Medium2')
ax.set_xlabel('concentration')
ax.set_ylabel('average reads per peak, relative to Low peaks')
ax.set_title('Read count ratio vs concentration')
ax.legend(loc='upper right')
plt.show()

The results of this analysis, which uses the ratio between the numbers of reads instead of the peak quality, show that there is an interesting non-monotonic dependence of the ratio on the concentration.

Let's assume for a moment that this effect is real. I want to look at possible reasons why this could be the case. I'll go back to the chair model and look at non-uniform transition matrices.