In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats
from Bio import SeqIO, Seq
from Bio.Alphabet import IUPAC
from Bio.motifs.matrix import PositionWeightMatrix

import mybiotools as mbt

# PR titration
Guille and Roser performed experiments with different concentrations of R5020 given to the T47D cell line, and observed interesting effects. At the smallest concentrations of the progestin, about 750 peaks of PR binding appear in the ChIP-seq profiles. These peaks are conserved in all successive experiments done at higher hormone concentrations.

It would be interesting to be able to **predict** that those are the peaks that should appear first when adding small amounts of hormone. The idea could be that the 3D structure of the genome somehow guides the search process, so that those first sites are consistently found first when adding small amounts of hormone.

To study all of this process, the first thing I should do is load the data. Let's start with loading the ChIP-seq track of the PR at the lowest concentration that they looked at.

In [None]:
def parse_simple_bed (fname) :
    """
    A parser for a BED file that contains only the indication of chromosome,
    region start and region end.

    Parameters
    ----------
    fname : anything `np.genfromtxt` accepts as input (e.g. a file name)

    Returns
    -------
    A 1-D structured array with the fields 'chr' (byte string of at most 10
    characters, longer names are truncated), 'start' and 'end' (int64), one
    record per line of the input.
    """
    simple_bed_dtype = np.dtype([
                                ('chr','S10'),
                                ('start',np.int64),
                                ('end',np.int64)
                               ])
    # genfromtxt collapses a one-line input into a 0-d array, which cannot be
    # looped over; force a 1-D result so that callers can always iterate
    return np.atleast_1d(np.genfromtxt (fname,dtype=simple_bed_dtype))

In [None]:
def load_peaks (peaks_id,xavi_datadir='/mnt/xavi') :
    """
    Load the genomic coordinates of one peak population.

    `peaks_id` is the population name that completes the BED file name and
    `xavi_datadir` is the root directory under which the analysis tables
    live. Returns whatever `parse_simple_bed` returns for that file.
    """
    tables_dir = '%s/projects/gvicent/analysis/2017-01-23_characterisation_prbs_r5020_titration/tables'%(xavi_datadir)
    bed_path = '%s/genomic_coordinates_by_peak_population_%s.bed'%(tables_dir,peaks_id)
    peak_table = parse_simple_bed(bed_path)
    return peak_table

In [None]:
# names of the peak populations, as they appear in the BED file names
peak_types = ['all_treated', '4HCP', '3HCP', '1HCP']
# population name -> table of genomic coordinates
peaks = {}
for peak_type in peak_types:
    peaks[peak_type] = load_peaks(peak_type)

Now the data is loaded, let's look a little bit at whether we can do some simple stuff using the Hi-C data for the same cell line and at the time before treatment with hormone.

In [None]:
hic_sample_id = 'dc3a1e069_467f847a2'
# bin size of the Hi-C matrix, in base pairs
resolution = 50000
hic_file = mbt.hic_location(hic_sample_id, resolution, datatype_string='normalized')
# echo the path only if the file is really there
if os.path.exists(hic_file):
    print(hic_file)

In [None]:
# parse the 'normalized' Hi-C file located in the previous cell; the records
# are accessed below through their 'chr', 'i', 'j' and 'val' fields
hic = mbt.parse_hic(hic_file)

Let's start by drawing some stuff.

In [None]:
class Chromosome :
    """
    Lightweight container for per-chromosome data. Only the name is set
    here; other attributes (e.g. `H`, `peaks`) are attached by the notebook
    cells that use the object.
    """
    def __init__(self, name):
        # chromosome label, e.g. 'chr18'
        self.name = name

In [None]:
def counts_to_hic (counts,start,end,resolution) :
    """
    Returns a complete filled matrix given the 'counts' array, by taking
    for granted that the counts correspond to a given chromosome.

    Parameters
    ----------
    counts : iterable of records with the fields 'i', 'j' (bin coordinates,
        in the same units as `start`/`end`) and 'val' (the contact value);
        `counts['val'].dtype` sets the dtype of the output
    start, end : first and last coordinate covered by the matrix
    resolution : bin size

    Returns
    -------
    A symmetric (N,N) array with N = (end-start)//resolution + 1. Bins that
    do not appear in `counts` are left at zero.
    """
    # floor division: same result as Python 2 integer division, but the bin
    # numbers remain integers (hence valid shapes/indices) under true division
    N = (end-start)//resolution + 1
    H = np.zeros((N,N),dtype=counts['val'].dtype)
    for h in counts :
        i = (h['i']-start)//resolution
        j = (h['j']-start)//resolution
        # fill both triangles so that the matrix is symmetric
        H[i,j] = H[j,i] = h['val']
    return H

In [None]:
chromosome_name = 'chr18'
chromosome = Chromosome (chromosome_name)
# keep only the Hi-C records that belong to this chromosome
chromosome_hic = np.array([h for h in hic if h['chr']==chromosome_name])
# largest bin coordinate found on either axis of the contact list
N = max(chromosome_hic['i'].max(),chromosome_hic['j'].max())
# floor division keeps the bin count an integer (same value as Python 2
# integer division)
n = N//resolution
chromosome.H = counts_to_hic(chromosome_hic,0,N,resolution)

In [None]:
# I'll plot the "all_treated" peaks first
chromosome.peaks = {}
peak_type = 'all_treated'
chromosome.peaks[peak_type] = np.array([p for p in peaks[peak_type] if p['chr']==chromosome_name])
# the peak track must have exactly one entry per Hi-C bin: the matrix built
# by counts_to_hic has N/resolution + 1 bins, so sizing the track with
# N/resolution would leave the last bin without a slot
n_bins = chromosome.H.shape[0]
x = np.arange(n_bins)
y = np.zeros(n_bins)
for p in chromosome.peaks[peak_type] :
    # bin that contains the start of the peak
    i = p['start']//resolution
    y[i] = 1.0

In [None]:
fig = plt.figure(figsize=(10, 12))
gs = plt.GridSpec(2, 1, hspace=0, height_ratios=[10, 2])
# top panel: contact matrix, shown as 1 - log(H)
ax = plt.subplot(gs[0, 0])
ax.matshow(1 - np.log(chromosome.H), cmap=plt.cm.Greens)
# bottom panel: peak track, sharing the x axis with the matrix
ax = plt.subplot(gs[1, 0], sharex=ax)
mbt.line_plot(ax, x, y)

Okay let's do it for all the chromosomes.

In [None]:
# 'chr1' ... 'chr22' (range(1,23) stops before 23); no sex chromosomes here
chromosomes = ['chr%d'%d for d in range(1,23)]

In [None]:
peak_type = 'all_treated'
for chromosome_name in chromosomes :
    chromosome = Chromosome (chromosome_name)
    # Hi-C records of this chromosome, and the largest bin coordinate in them
    chromosome_hic = np.array([h for h in hic if h['chr']==chromosome_name])
    N = max(chromosome_hic['i'].max(),chromosome_hic['j'].max())
    chromosome.H = counts_to_hic(chromosome_hic,0,N,resolution)
    # one track entry per Hi-C bin: the matrix has N/resolution + 1 bins, so
    # the size is read from the matrix instead of being recomputed
    n = chromosome.H.shape[0]
    chromosome.peaks = {}
    chromosome.peaks[peak_type] = np.array([p for p in peaks[peak_type] if p['chr']==chromosome_name])
    x = np.arange(n)
    y = np.zeros(n)
    for p in chromosome.peaks[peak_type] :
        # floor division: integer bin index also under true division
        i = p['start']//resolution
        y[i] = 1.0
    # contact matrix on top, peak track below, sharing the x axis
    fig = plt.figure(figsize=(10,12))
    gs = plt.GridSpec(2,1,hspace=0,height_ratios=[10,2])
    ax = plt.subplot(gs[0,0])
    ax.matshow(1-np.log(chromosome.H),cmap=plt.cm.Greens)
    ax = plt.subplot(gs[1,0],sharex=ax)
    mbt.line_plot(ax,x,y)
    fig.savefig('/home/rcortini/work/CRG/code/notes/human/2017-09-19-PR_titration/%s.png'%chromosome_name)

By visually looking at the patterns, I don't see anything. Let's look at the sequence.

In [None]:
# path of the genome FASTA file, built from the HOME environment variable
hg19_genome_file = os.getenv('HOME') + '/work/data/GRCh37.fasta'
# index of the FASTA records; used below as genome[chromosome].seq
h19 = SeqIO.index (hg19_genome_file,'fasta',alphabet=IUPAC.unambiguous_dna)

In [None]:
# load the PR binding motif matrix so that Bio understands it
motif_file = os.getenv('HOME')+'/work/data/motif231.motif'
M = np.genfromtxt(motif_file, comments='>')
# the columns of the file are taken in the order A, C, G, T
Mdict = {}
for i, letter in enumerate(['A','C','G','T']):
    Mdict[letter] = M[:, i]
pwm = PositionWeightMatrix(IUPAC.unambiguous_dna, Mdict)
# log-odds scoring matrix derived from the weight matrix
pssm = pwm.log_odds()
motif_length = len(pwm['A'])

In [None]:
def get_max_seq_score(genome,peaks,pssm) :
    """
    Best motif score found inside each peak, looking at both strands.

    For every peak the sequence genome[peak['chr']].seq[start:end] is scored
    with `pssm` and with its reverse complement, and the largest score is
    kept.

    Returns an array with exactly one entry per peak, in the order of
    `peaks`. When scoring a peak raises MemoryError the peak is printed and
    its entry is NaN, so that the indices stay aligned with `peaks`.
    """
    # the reverse-complement matrix is the same for every peak: build it once
    pssm_rc = pssm.reverse_complement()
    max_scores = []
    for peak in peaks :
        seq = genome[peak['chr']].seq[peak['start']:peak['end']]
        try :
            scores_f = pssm.calculate(seq)
            scores_b = pssm_rc.calculate(seq)
            max_scores.append(max(scores_f.max(),scores_b.max()))
        except MemoryError :
            print('%s %s'%(seq,peak))
            # placeholder instead of silently dropping the peak
            max_scores.append(np.nan)
    return np.array(max_scores)

In [None]:
# best motif score of every peak, for each peak population
max_scores = {}
for peak_type in peak_types :
    # log which population is being processed
    mbt.log_message('get_max_seq_score','peak_type = %s'%peak_type)
    max_scores[peak_type] = get_max_seq_score(h19,peaks[peak_type],pssm)

In [None]:
# grid of scores on which the densities are evaluated
x = np.arange(-1.,15.,0.01)
k_max_scores = {}
for peak_type in peak_types :
    scores = max_scores[peak_type]
    # NaN entries cannot enter the kernel density estimate.
    # scipy.stats is imported explicitly in the import cell: `import scipy`
    # on its own does not guarantee that the submodule is loaded
    k_max_scores[peak_type] = scipy.stats.gaussian_kde(scores[~np.isnan(scores)])
    plt.plot(x,k_max_scores[peak_type](x),label=peak_type)
plt.legend(loc='upper left')
plt.xlabel("Site affinity",fontsize=24)
plt.ylabel("Distribution",fontsize=24)
plt.show()

The conclusion from this quick and dirty analysis is that the sites that are occupied at all the R5020 concentrations have a (maximum) binding affinity that is greater than that of other sequences that are bound at higher R5020 concentrations.

# A toy model

I want to look at a simple model in which I put in the following ingredients:

- a fixed number of binding sites, each with its binding affinity given, and fixed
- the binding affinity is in a one-to-one relationship to the time that the protein spends on it, by an exponential function
- I'll look at what happens to the equilibrium distribution as a function of the number of searching proteins.
- proteins can be either diffusing or bound

In [None]:
def affinity_to_time (affinity) :
    """
    Residence time associated with a binding affinity: exp(affinity).
    Works on scalars and on arrays (element by element).
    """
    residence_time = np.exp(affinity)
    return residence_time

In [None]:
class Searcher :
    """
    One searching protein. Only the index is set here; the current site
    (`site`) and the detach time (`td`) are attached by the simulation cells.
    """
    def __init__(self, index):
        # position of this searcher in the list of searchers
        self.index = index

In [None]:
# number of sites in the system
N = 50
# init sites: the affinities are normally distributed and each one is
# converted into a residence time
affinity_mu = 1.0
affinity_sigma = 0.2
np.random.seed(934204)
# a single vectorised draw consumes the seeded random stream exactly like N
# successive scalar draws, so the values are the same as with an explicit loop
# (use affinity_mu for every site to test the uniform-affinity case)
affinities = np.random.normal(loc=affinity_mu,scale=affinity_sigma,size=N)
site_times = np.asarray(affinity_to_time(affinities))

In [None]:
# number of searchers
M = 10
# the initial occupancy vector: M distinct sites chosen among the N
np.random.seed(9342)
initial_occupied_sites = np.random.choice(N, M, replace=False)
# init searchers, one per initially occupied site
searchers = []
for i, start_site in enumerate(initial_occupied_sites):
    searcher = Searcher(i)
    searcher.site = start_site
    # the detach time is exponentially distributed, with the residence time
    # of the site as its scale
    searcher.td = np.random.exponential(scale=site_times[searcher.site])
    searchers.append(searcher)

In [None]:
# total simulation time
T = 100000
# init occupancy: number of time steps that each site has spent occupied
occupancy = np.zeros(N,dtype=np.int32)
# occupancy at each time step (1 = occupied, 0 = free)
occupancy_t = np.zeros((T,N),dtype=np.int32)
# init occupied sites
free_sites = np.ones(N,dtype=bool)
free_sites[initial_occupied_sites] = False
np.random.seed(934)
# main simulation loop
for t in range(T) :
    # record the OCCUPIED sites: storing free_sites itself would save the
    # complement of what the array name (and TEV_model_simulate) mean
    occupancy_t[t,:] = ~free_sites
    for searcher in searchers :
        # update occupancy vector
        occupancy[searcher.site] += 1
        # a searcher that has not reached its detach time stays where it is
        if t<searcher.td : continue
        # the searcher has detached: choose a new site.
        # NOTE: this loop only ends if there is at least one free site (M < N)
        found = False
        while not found :
            # this line chooses a new site randomly from the sites in
            # the system
            new_site = np.random.randint(0,N)
            # check if new site is free
            if free_sites[new_site] :
                # if the chosen new site is free, then update the
                # "free_sites" vector and the searcher.site
                free_sites[new_site] = False
                free_sites[searcher.site] = True
                searcher.site = new_site
                # assign the detach time to the searcher
                searcher.td = t + np.random.exponential(scale=site_times[searcher.site])
                found = True

In [None]:
fig = plt.figure(figsize=(4, 14))
gs = plt.GridSpec(3, 1, height_ratios=[4, 1, 1], hspace=0)
# top: a 200-step window of the time-resolved occupancy
ax = plt.subplot(gs[0, 0])
ax.matshow(occupancy_t[1100:1300, :])
# middle: residence time of each site
ax = plt.subplot(gs[1, 0], sharex=ax)
x = np.arange(N)
mbt.line_plot(ax, x, site_times, color='b')
ax.set_ylabel('Times')
# bottom: accumulated occupancy of each site
ax = plt.subplot(gs[2, 0], sharex=ax)
mbt.line_plot(ax, x, occupancy, color='r')
ax.set_ylabel('Occupancy')

In [None]:
fig = plt.figure(figsize=(10, 4))
ax = plt.subplot(111)
x = np.arange(N)
tav = site_times.mean()
# measured occupancy, as a fraction of the M*T searcher-steps
ax.plot(x, occupancy / (M * float(T)), color='b', label='M = 10')
# residence times normalised by their sum (N*tav)
ax.plot(x, site_times / (N * tav), color='r', label='M = 1')
ax.legend(loc='upper left')
# uniform occupancy 1/N, for reference
ax.axhline(y=1.0 / N, linestyle='--', color='k', linewidth=0.75)
ax.set_xlabel('Site index', fontsize=24)
ax.set_ylabel('Occupancy', fontsize=24)
plt.show()

Okay so we have the first results of playing with this toy model. The results show that the high affinity sites become progressively depleted of occupancy, whereas the sites that have low affinity are progressively more occupied. The sites whose occupancy does not change are the ones whose residence time equals the **average residence time** over all sites. Therefore, this gives a possible avenue to estimate this quantity from real data. However, there is the big caveat that the enrichment/site score is perhaps not a good proxy for the residence time. Before tackling this complicated thing, I will encapsulate the simulation routine in a convenient function, so that I can test different cases easily.

In [None]:
def TEV_model_simulate (T,N,M,site_times,seed=304998) :
    """
    Simulate M searchers hopping between N binding sites for T time steps.

    Every searcher is always bound to a site. When the current time reaches
    its detach time it jumps to a free site chosen uniformly at random, and a
    new detach time is drawn from an exponential distribution whose scale is
    the residence time of the new site.

    Parameters
    ----------
    T : number of time steps (rows of the returned array)
    N : number of sites
    M : number of searchers (at most N)
    site_times : residence time of each site (sequence of length N)
    seed : value passed to np.random.seed before the simulation; pass None
        to leave the global random state untouched

    Returns
    -------
    Boolean array of shape (T,N): element [t,i] is True if site i is
    occupied at time step t.
    """
    if seed is not None :
        np.random.seed(seed)
    # init initial occupied sites
    sites = np.random.choice(N,M,replace=False)
    # init detach times vector
    td = np.zeros(M)
    for j in range(M) :
        td[j] = np.random.exponential(scale=site_times[sites[j]])
    # init initial occupancy
    occupancy_t = np.zeros((T,N),dtype=bool)
    occupancy_t[0,sites] = True
    # when every site is taken (M == N) a detached searcher has nowhere to
    # go: without this flag the search for a free site would never end
    can_relocate = M<N
    # main simulation loop
    for t in range(1,T) :
        # start from the state of the previous step
        occupancy_t[t,:] = occupancy_t[t-1,:]
        for j in range(M) :
            # a searcher that has not reached its detach time stays put
            if not can_relocate or t<td[j] : continue
            # the searcher has detached: choose a new site
            found = False
            while not found :
                # this line chooses a new site randomly from the sites in
                # the system
                new_site = np.random.randint(0,N)
                # check if new site is free
                if not occupancy_t[t,new_site] :
                    # if the chosen new site is free, move the searcher
                    # there and release the site that it leaves
                    occupancy_t[t,new_site] = True
                    occupancy_t[t,sites[j]] = False
                    sites[j] = new_site
                    # assign the detach time to the searcher
                    td[j] = t + np.random.exponential(scale=site_times[new_site])
                    found = True
    return occupancy_t

In [None]:
N = 50
# init sites: normally distributed affinities, converted to residence times.
# A single vectorised draw consumes the random stream like N scalar draws.
# NOTE: no seed is set here, so the values depend on the state left in the
# global generator by the cells run before this one
affinity_mu = 1.0
affinity_sigma = 0.2
affinities = np.random.normal(loc=affinity_mu,scale=affinity_sigma,size=N)
site_times = np.asarray(affinity_to_time(affinities))
# numbers of searchers to compare
M = np.array([1,10,25,40])
nM = len(M)
T = 100000
# total number of occupied time steps per site, one row per value of M
occupancy_m = np.zeros((nM,N),dtype=np.int32)
for i,m in enumerate(M) :
    occupancy_t = TEV_model_simulate(T,N,m,site_times)
    occupancy_m[i,:] = occupancy_t.sum(axis=0)

In [None]:
# sites with the shortest and the longest residence time
imin = site_times.argmin()
imax = site_times.argmax()
# the mean must be that of the CURRENT site_times: a `tav` left over from an
# earlier cell was computed on a different set of sites
tav = site_times.mean()
# site whose residence time is closest to the mean
imean = np.argmin(np.abs(tav-site_times))
# occupancy as a fraction of the M*T searcher-steps, for each value of M
plt.plot(M,occupancy_m[:,imin]/(M*float(T)),label='Low affinity')
plt.plot(M,occupancy_m[:,imax]/(M*float(T)),label='High affinity')
plt.plot(M,occupancy_m[:,imean]/(M*float(T)),label='Mean affinity')
plt.legend(loc='upper right')
plt.xlabel('Number of searchers',fontsize=24)
plt.ylabel('Occupancy',fontsize=24)
plt.show()

These results are quite clear. It's interesting to see how the gradient of the occupancy as a function of the concentration might help discern whether the site has an average residence time which is higher or lower than the mean residence time. Now the challenge is to figure out whether this model may explain anything about the data from Miguel's lab.

# Back to the data

Here I want to try and figure out a way of looking at whether the simple results from our basic model may explain any feature of the data on the PR titration experiments. The first thing is to load the raw ChIP-seq tracks, and to look at how the data on those peaks vary as a function of the concentration of PR.

In [None]:
# prepare data: the concentrations (nM) and the ChIP-seq track of each one
concentrations = [0.05, 0.10, 0.50, 1.0, 10.0]
track_ids = {
    0.05: 'gv_107_01_01_chipseq',
    0.10: 'gv_108_01_01_chipseq',
    0.50: 'gv_109_01_01_chipseq',
    1.0: 'gv_110_01_01_chipseq',
    10.0: 'gv_111_01_01_chipseq',
}

In [None]:
# load all ChIP-seq tracks
# concentration -> parsed narrowPeak track
tracks = {}
for concentration in concentrations :
    # file of the track that corresponds to this concentration
    f = mbt.track_location('chipseq',track_ids[concentration])
    tracks[concentration] = mbt.parse_narrowpeak(f)

Now we need to do the following: analyze how the ChIP-seq peaks' enrichment and/or q values vary as a function of the concentration. I'll start by picking one peak and writing the code to fetch this information from the tracks.

In [None]:
# I pick one peak (arbitrarily: the one with index 3)
peak = peaks['all_treated'][3]
print(peak)

In [None]:
# now I need to find this peak in the loaded tracks
track = tracks[0.50]
# first I select the peaks corresponding to the peak's chromosome
track_chr = np.array([p for p in track if p['chr'] == peak['chr']])
# then I need to see for each peak if there is an overlap between the peak's
# coordinates and the one from the ChIP-seq track
for p in track_chr:
    # two intervals overlap when the span that covers both of them (d) is
    # not longer than the sum of their widths (D); intervals that merely
    # touch are counted as overlapping
    d = max(p['end'], peak['end']) - min(p['start'], peak['start'])
    D = (peak['end'] - peak['start']) + (p['end'] - p['start'])
    if d <= D:
        overlap = True
        print(p)

Okay this works. So let's look at the data for this peak.

In [None]:
def find_peak (track,peak) :
    """
    Return the first record of `track` that lies on the chromosome of `peak`
    and overlaps it, or None if there is no such record.

    `track` is an iterable of records with the fields 'chr', 'start' and
    'end'; `peak` is one record with the same fields. Intervals that merely
    touch are counted as overlapping.
    """
    peak_width = peak['end']-peak['start']
    for p in track :
        # only the peaks that correspond to the correct chromosome matter
        if p['chr']!=peak['chr'] : continue
        # a tiny algorithm to test whether there is an overlap: the span that
        # covers both intervals is not longer than the sum of their widths
        d = max(p['end'],peak['end'])-min(p['start'],peak['start'])
        D = peak_width + (p['end']-p['start'])
        if d<=D :
            return p
    # explicit "not found", so that callers can test for it
    return None

In [None]:
# record of the selected peak in each track, in the order of `concentrations`
# (an entry is None when find_peak finds no overlapping peak)
data = []
for concentration in concentrations :
    track = tracks[concentration]
    data.append(find_peak(track,peak))

In [None]:
# display the records found for the selected peak
data

In [None]:
# one array per field of interest, ordered like `concentrations`
data_q, data_p, data_score, data_val = [
    np.array([p[field] for p in data])
    for field in ('q', 'p', 'score', 'val')
]

In [None]:
# value of the selected peak against the concentration (log x axis)
plt.semilogx(concentrations, data_val)
plt.xlabel('Concentration [nM]')
plt.ylabel('Peak val')
plt.show()

In [None]:
# number of peaks and mean score of each track
for concentration in concentrations:
    this_track = tracks[concentration]
    print('%s %s %s %s' % (concentration, 'nM ', this_track.size, this_track['score'].mean()))

In [None]:
# 'p' field of one track; `concentration` is whatever value the last loop
# over `concentrations` left behind
tracks[concentration]['p']

Here the conclusion is quite clear: the enrichment of the high-affinity peaks is increasing as a function of the hormone concentration. This is at odds with the predictions of the TEV model. However, chances are that I might have missed an important piece of the puzzle, which is the regime in which there are many high-affinity sites and there are fewer searchers than high-affinity sites.

So I want to go back to the problem and look at this regime. I will hypothesize that there are only two classes of sites: high-affinity and low-affinity sites. But I will simulate a much larger system than the one I did before.

# A lot of high affinity sites

Here I look at a different regime: $N$ large, a lot of high-affinity sites, and $M$ of the order of the number of high-affinity sites.

In [None]:
# number of sites, and the fraction of them that has high affinity
N = 500
phi_N = 0.1
nhigh = int(N * phi_N)
nlow = N - nhigh
# number of searchers, given as a fraction of the number of sites
phi_M = 0.01
M = int(N * phi_M)

In [None]:
# every site has residence time 1, except nhigh randomly chosen sites that
# have residence time 10
site_times = np.ones(N)
high_sites = np.random.choice(N, size=nhigh, replace=False)
site_times[high_sites] = 10.0

In [None]:
# number of time steps of the simulation
T = 100000
# time-resolved occupancy of the two-class system
occupancy_t = TEV_model_simulate(T,N,M,site_times)

In [None]:
# number of time steps that each site has spent occupied
occupancy = occupancy_t.sum(axis=0)
fig = plt.figure(figsize=(10, 4))
ax = plt.subplot(111)
x = np.arange(N)
tav = site_times.mean()
# residence times normalised by their sum (N*tav)
ax.plot(x, site_times / (N * tav), color='r', label='Expected')
# measured occupancy, as a fraction of the M*T searcher-steps
ax.plot(x, occupancy / (M * float(T)), color='b', label='Occupancy')
ax.legend(loc='upper left')
# uniform occupancy 1/N, for reference
ax.axhline(y=1.0 / N)
plt.show()

Okay so we have no effect due to the fact that the number of searchers is lower than the number of high-affinity sites. The global effect is always that of relative depletion of the occupancy of the high-affinity sites.

# Another idea

Now it's true that the enrichment of the high-quality peaks increases with increasing concentration, but this may be due to the fact that there are many cells and there are more binding events per population. Within the single cell, it is possible that the effect of relative depletion of the occupancy of the high-quality peaks is still visible. This might be evident from the ratio of the enrichment of the medium (or low) quality peaks as compared to the ratio of the enrichment of the high quality peaks when increasing the concentration of hormone. Let's go and see.

In [None]:
def do_peaks_overlap(peaks_chr,target_peak) :
    """
    Return the first record of `peaks_chr` whose interval overlaps that of
    `target_peak`, or None if there is none.

    Only the 'start' and 'end' fields are compared: the caller is expected
    to pass candidates that lie on the chromosome of the target. Intervals
    that merely touch are counted as overlapping.
    """
    target_peak_width = target_peak['end']-target_peak['start']
    for p in peaks_chr :
        # a tiny algorithm to test whether there is an overlap: the span that
        # covers both intervals is not longer than the sum of their widths
        d = max(p['end'],target_peak['end'])-min(p['start'],target_peak['start'])
        D = target_peak_width + (p['end']-p['start'])
        if d<=D :
            return p
    return None

In [None]:
def do_peaks_overlap_complete(candidate_peaks,target_peak) :
    """
    Return the first record of `candidate_peaks` that lies on the chromosome
    of `target_peak` and overlaps it, or None if there is none.

    Unlike do_peaks_overlap, the candidates may come from any chromosome:
    the selection on 'chr' is done here. The overlap criterion is the same
    (intervals that merely touch are counted as overlapping).
    """
    target_peak_width = target_peak['end']-target_peak['start']
    for p in candidate_peaks :
        # skip the candidates that are on another chromosome
        if p['chr']!=target_peak['chr'] : continue
        # a tiny algorithm to test whether there is an overlap: the span that
        # covers both intervals is not longer than the sum of their widths
        d = max(p['end'],target_peak['end'])-min(p['start'],target_peak['start'])
        D = target_peak_width + (p['end']-p['start'])
        if d<=D :
            return p
    return None

In [None]:
# chr1 ... chr22 plus chrX
chromosomes = ['chr%d'%d for d in range(1,23)]
chromosomes.append('chrX')
specific_4HCP_peaks = []
# the search is done chromosome by chromosome
for chromosome in chromosomes:
    # peaks of the 4HCP population on this chromosome
    peaks_4HCP = np.array([p for p in peaks['4HCP'] if p['chr'] == chromosome])
    # peaks of the all-treated condition on this chromosome
    peaks_AT = np.array([p for p in peaks['all_treated'] if p['chr'] == chromosome])
    # all against all: keep the 4HCP peak that overlaps each all-treated peak
    for peak_AT in peaks_AT:
        p = do_peaks_overlap(peaks_4HCP, peak_AT)
        if p is not None:
            specific_4HCP_peaks.append(p)

Okay I did not understand well the definitions of the populations: in the 4HCP population there are NOT the peaks that correspond to the "all_treated" conditions. Therefore I have the job simplified, because there are no overlaps to disentangle. Now let's proceed to evaluating the ratio between the enrichment of these peaks at the higher and lower concentration.

In [None]:
# I start from one peak of the 4HCP population: if my reasoning is correct
# this peak should be present in the 0.10 nM case, in the 0.50 nM case, but
# not in the 0.05 nM case. First let's verify that this is indeed the case.
target_peak_4HCP = peaks['4HCP'][1203]
print(target_peak_4HCP)
chromosome = target_peak_4HCP['chr']
for condition in [0.05, 0.10, 0.50]:
    # peaks of this track that are on the chromosome of the target
    track_chr = np.array([p for p in tracks[condition] if p['chr'] == chromosome])
    print('%s %s' % (condition, do_peaks_overlap(track_chr, target_peak_4HCP)))

Perfect, it works. I now create a table of the values of the peak scores for each class of peaks, for each condition.

In [None]:
n_concentrations = len(concentrations)
n_AT = len(peaks['all_treated'])
# score of each all-treated peak (rows) at each concentration (columns);
# entries of peaks that are not found in a track stay at zero
scores_AT = np.zeros((n_AT,n_concentrations))
target_peaks = peaks['all_treated']
# peaks of each track on each chromosome, filled on first use: filtering the
# whole track again for every single target peak is by far the slowest step
candidates_by_chr = {}
for i,target_peak in enumerate(target_peaks) :
    chromosome = target_peak['chr']
    for j in range(n_concentrations) :
        concentration = concentrations[j]
        key = (concentration,chromosome)
        if key not in candidates_by_chr :
            this_track = tracks[concentration]
            candidates_by_chr[key] = np.array([p for p in this_track if p['chr']==chromosome])
        candidate_peaks = candidates_by_chr[key]
        p = do_peaks_overlap(candidate_peaks,target_peak)
        if p is None :
            # report the peaks that have no counterpart in this track
            print('%s %s'%(concentration,target_peak))
        else :
            scores_AT[i,j] = p['score']

In [None]:
n_4HCP = len(peaks['4HCP'])
# score of each 4HCP peak (rows) at each concentration (columns); column 0
# (the lowest concentration) is not searched, and entries of peaks that are
# not found in a track stay at zero
scores_4HCP = np.zeros((n_4HCP,n_concentrations))
target_peaks = peaks['4HCP']
# peaks of each track on each chromosome, filled on first use: filtering the
# whole track again for every single target peak is by far the slowest step
candidates_by_chr = {}
for i,target_peak in enumerate(target_peaks) :
    chromosome = target_peak['chr']
    for j in range(1,n_concentrations) :
        concentration = concentrations[j]
        key = (concentration,chromosome)
        if key not in candidates_by_chr :
            this_track = tracks[concentration]
            candidates_by_chr[key] = np.array([p for p in this_track if p['chr']==chromosome])
        candidate_peaks = candidates_by_chr[key]
        p = do_peaks_overlap(candidate_peaks,target_peak)
        if p is None :
            # report the peaks that have no counterpart in this track
            print('%s %s'%(concentration,target_peak))
        else :
            scores_4HCP[i,j] = p['score']

In [None]:
# the two score tables side by side: rows are peaks, columns concentrations
plt.figure(figsize=(10, 15))
ax = plt.subplot(121)
ax.imshow(scores_AT, aspect='auto')
ax = plt.subplot(122)
ax.imshow(scores_4HCP, aspect='auto')
plt.show()

Now that the data is conveniently all loaded into these two arrays, we can do the comparison between the increment of the score in the 4HCP case versus the all_treated case.

In [None]:
# score at concentrations[2] divided by the score at concentrations[1], peak
# by peak; a zero score (peak not found in the track) gives nan or inf here
ratio_4HCP = scores_4HCP[:,2]/scores_4HCP[:,1]
ratio_AT = scores_AT[:,2]/scores_AT[:,1]

In [None]:
# score at concentrations[2] minus the score at concentrations[1], peak by
# peak; peaks that were not found in a track enter with a score of zero
diff_4HCP = scores_4HCP[:,2]-scores_4HCP[:,1]
diff_AT = scores_AT[:,2]-scores_AT[:,1]

In [None]:
# keep the finite ratios only: 0/0 gives nan and x/0 gives inf, and an inf
# left in the data makes the range of the histogram non-finite
plt.hist(ratio_4HCP[np.isfinite(ratio_4HCP)],bins=100)
plt.title("4HCP-specific peaks: ratio")
plt.xlim(0,20)
plt.show()

In [None]:
# keep the finite ratios only: 0/0 gives nan and x/0 gives inf, and an inf
# left in the data makes the range of the histogram non-finite
plt.hist(ratio_AT[np.isfinite(ratio_AT)],bins=100)
plt.xlim(0,20)
plt.title("All treated: ratio")
plt.show()

In [None]:
# distribution of the score differences of the 4HCP peaks (NaN removed)
valid_diff_4HCP = ~np.isnan(diff_4HCP)
plt.hist(diff_4HCP[valid_diff_4HCP], bins=100)
plt.title("4HCP-specific peaks: diff")
plt.show()

In [None]:
# distribution of the score differences of the all-treated peaks (NaN removed)
valid_diff_AT = ~np.isnan(diff_AT)
plt.hist(diff_AT[valid_diff_AT], bins=100)
plt.title("all-treated: diff")
plt.show()

In [None]:
# grid of score differences on which the densities are evaluated
x = np.arange(-2000,4000)
# kernel density estimates of the two distributions (NaN removed).
# scipy.stats is imported explicitly in the import cell: `import scipy` on
# its own does not guarantee that the submodule is loaded
k_diff_4HCP = scipy.stats.gaussian_kde(diff_4HCP[~np.isnan(diff_4HCP)])
y_diff_4HCP = k_diff_4HCP(x)
k_diff_AT = scipy.stats.gaussian_kde(diff_AT[~np.isnan(diff_AT)])
y_diff_AT = k_diff_AT(x)

In [None]:
# the two estimated densities on the same axes
plt.plot(x, y_diff_4HCP, label='4HCP-specific')
plt.plot(x, y_diff_AT, label='all_treated')
plt.legend(loc='upper right')
plt.xlabel('Difference between peak values')
plt.ylabel('Distribution')
plt.show()