In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension; mode 2 reloads imported modules before each
# cell is executed, so edits to imported .py modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mybiotools as mbt
import os
import pr_peaks
from scipy.stats import gaussian_kde
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.motifs.matrix import PositionWeightMatrix

In [None]:
# Load the ChIP-seq data: one pr_peaks.Condition per experimental condition.
# Each tuple holds the positional arguments handed to pr_peaks.Condition, in
# order; what each field means is defined by pr_peaks.Condition (not visible
# in this notebook).
condition_specs = [
    ('high',    'all_treated', 0.05, 'gv_107_01_01_chipseq'),
    ('medium1', '4HCP',        0.10, 'gv_108_01_01_chipseq'),
    ('medium2', '3HCP',        0.50, 'gv_109_01_01_chipseq'),
    ('low',     '1HCP',        10.0, 'gv_111_01_01_chipseq'),
]
conditions = [pr_peaks.Condition(*spec) for spec in condition_specs]

# 2018-05-04 Palindromes?
The idea that cooperativity is the key effect that should explain h-enhancement seems to withstand attacks from multiple sides. Now, I can think of two distinct ways in which cooperativity (that is, stabilization of the protein-DNA contact through protein-protein interaction) can be implemented.

- Case 1: **DNA loop-mediated tetramerization**. In this case, two PR molecules should bind to distal portions of DNA, and the result of the interaction creates a tetramer and results in looping of the DNA.
- Case 2: **Binding of the PR to palindromic sequences**. The consensus DNA sequence of the PR is a palindrome, so one possibility is that tandem repeats of the consensus sequence facilitate a cis-mediated interaction that results in cooperative effects.

To distinguish between these two hypotheses, we need to get the sequences that correspond to the H peaks and the L peaks and compare them.

In [None]:
# Build the genome path from the user's home directory. os.path.expanduser
# still resolves when the HOME environment variable is unset (os.getenv('HOME')
# would return None and make the string concatenation raise TypeError), and
# os.path.join avoids hand-assembled separators.
hg19_genome_file = os.path.join(os.path.expanduser('~'), 'work', 'data', 'GRCh37.fasta')
# Index the FASTA file so that individual records can be looked up by id
# without parsing the whole genome up front. The alphabet is declared as
# unambiguous DNA for every record.
# NOTE(review): lookups into this index must use the record ids exactly as
# they appear in the FASTA headers (e.g. 'chr1' vs '1') -- confirm they match
# the chromosome names used in the peak files.
h19 = SeqIO.index(hg19_genome_file, 'fasta', alphabet=IUPAC.unambiguous_dna)

In [None]:
# Load the PR binding motif matrix so that Bio understands it.
# The path is built with os.path.expanduser/os.path.join so that it still
# resolves when the HOME environment variable is unset (os.getenv('HOME')
# would return None and make the string concatenation raise TypeError).
motif_file = os.path.join(os.path.expanduser('~'), 'work', 'data', 'motif231.motif')
# comments='>' makes genfromtxt discard everything from a '>' character to the
# end of the line, which skips '>'-prefixed header lines in the motif file.
M = np.genfromtxt(motif_file, comments='>')
# Columns 0..3 of M are used, in this order, as the A, C, G and T entries.
# NOTE(review): assumes the motif file lists its columns as A, C, G, T --
# confirm against the file format.
Mdict = {letter: M[:, i] for i, letter in enumerate(['A', 'C', 'G', 'T'])}
pwm = PositionWeightMatrix(IUPAC.unambiguous_dna, Mdict)
# Log-odds scoring matrix derived from the weight matrix; used for scanning.
pssm = pwm.log_odds()
# Number of positions in the motif (length of the 'A' row).
motif_length = len(pwm['A'])

In [None]:
# Draw the sequence logo of the motif with the project helper
# mbt.sequence_logo and keep whatever it returns in `fig`
# (presumably a matplotlib figure -- TODO confirm against mybiotools).
fig = mbt.sequence_logo(pwm)

Let's analyze the DNA sequences underlying the H peaks, starting with the first one.

In [None]:
# Worked example: take the first entry of `conditions`, unpack its first peak
# into (chromosome, start, end), and slice the matching stretch out of the
# indexed genome.
high = conditions[0]
(chromosome, start, end) = high.peaks[0]
chromosome_record = h19[chromosome]
seq = chromosome_record.seq[start:end]
# Single-argument print(...) produces the same output as the print statement.
print(seq)

In [None]:
# Find the best-scoring motif position in `seq` for the motif and for its
# reverse complement, then print the sequence around each hit.
N = pwm.length
# Index of the highest score when scanning with the log-odds matrix.
i_f_max = pssm.calculate(seq).argmax()
# Same scan using the reverse-complemented matrix.
i_b_max = pssm.reverse_complement().calculate(seq).argmax()
# Clamp the window start at 0: when the best hit lies within N bases of the
# start of `seq`, `i - N` is negative and Python slicing would interpret it as
# an offset from the END of the sequence, silently printing an empty or wrong
# window.
# NOTE(review): the index returned by calculate() is presumably the first base
# of the match, so this window is N upstream bases followed by the motif
# itself -- confirm this is the intended context.
print(seq[max(0, i_f_max - N):i_f_max + N])
print(seq[max(0, i_b_max - N):i_b_max + N])

In [None]:
# FIXME: unfinished scratch cell, disabled so the notebook can run top to bottom.
# The original line was `chr1.seq[]`, which is a SyntaxError (empty subscript),
# and `chr1` is not defined in any visible cell. Fill in the intended slice
# or delete this cell.