## Learning bi-partite motifs based on a thermodynamic approach
### Implements the dynamic programming and the gradient descent

In [1]:
import os
import numpy as np
from matplotlib import pyplot as plt
import itertools
import pandas as pd
from scipy.optimize import fmin_l_bfgs_b
from scipy.optimize import check_grad
from scipy.special import logsumexp
from Bio import SeqIO
import random

%load_ext cython

#Load Robert Kern's line profiler
%load_ext line_profiler
import line_profiler

In [2]:
#Set compiler directives (cf. http://docs.cython.org/src/reference/compilation.html)
from Cython.Compiler.Options import get_directive_defaults
directive_defaults = get_directive_defaults()
directive_defaults['linetrace'] = True
directive_defaults['binding'] = True

### cython

In [3]:
%%cython -f -I . --compile-args=-DCYTHON_TRACE=1 


cimport cython
import numpy as np
import itertools
from libc.math cimport exp,pow


cdef int l = 3 #l_A=l_B=3 nucleotides
cdef int l_p = 3 #persistence length is 3 nucleotides 
cdef double cpi = np.pi

cpdef generate_kmer_inx():
    cdef dict vals = {'A':0,'C':1,'G':2,'T':3}
    cdef dict kmer_inx = {}
    
    for p in list(itertools.product(vals.keys(), repeat=l)):
        inx = 0
        for j,base in enumerate(p):
            inx += (4**j)*vals[base] 
        kmer_inx[''.join(p)] = inx
    return kmer_inx

kmer_inx = generate_kmer_inx()
inx_kmer = {y:x for x,y in kmer_inx.items()}

cpdef seq2int_cy(str sequence):
    cdef int L = len(sequence)
    kmer_array = np.zeros(L, dtype=int)
    
    cdef i
    for i in range(l-1,L):
        kmer = sequence[i-l+1:i+1]
        kmer_array[i] = kmer_inx[kmer]
    return kmer_array        


cpdef void assign_za_cy(long[:] x, int i, double[:] za, double[:] zb, double[:] Ea, double[:] Eb, double cab, double sf, double D, double sig):
    if i == l-1:
        za[i] = cab * exp(-Ea[x[i]])
        return
    za[i] = (zb[i-l] + np.sum(za[:i-l+1])) * cab * exp(-Ea[x[i]])
    
cpdef void assign_zb_cy(long[:] x, int i, double[:] za, double[:] zb, double[:] Ea, double[:] Eb, double cab, double sf, double D, double sig):
    cdef double z = zb[i-1]
    cdef int j
    
    if i == l-1:
        z += cab*exp(-Eb[x[i]])  
    else:
        for j in range(0,i-l+1):
            z += za[j]*cb_c(i-j-l, sf, D, sig)*exp(-Eb[x[i]])
        z += zb[i-l]*cab*np.exp(-Eb[x[i]])      
    zb[i] = z 


cpdef void assign_za_E_derivatives_cy(long[:] x, int i, int inx, double[:] za, double[:] zb,
                                 double[:,:] za_Ea_derivatives, double[:,:] zb_Ea_derivatives, double[:,:] za_Eb_derivatives, double[:,:] zb_Eb_derivatives,
                                 double[:] Ea, double[:] Eb, double cab, double sf, double D, double sig):
    identical = (inx == x[i])
    
    if i == l-1:
        za_Ea_derivatives[inx,i] = -identical*cab*exp(-Ea[x[i]])
        za_Eb_derivatives[inx,i] = 0
        return
    
    za_Ea_derivatives[inx,i] = cab*(zb_Ea_derivatives[inx,i-l] + np.sum(za_Ea_derivatives[inx,:i-l+1]))*exp(-Ea[x[i]]) - cab*(zb[i-l] + np.sum(za[:i-l+1]))*identical*exp(-Ea[x[i]])
    za_Eb_derivatives[inx,i] = cab*(zb_Eb_derivatives[inx,i-l]+ np.sum(za_Eb_derivatives[inx,:i-l+1]))*exp(-Ea[x[i]])


cdef extern from "assign_zb_E_derivatives.c":
    pass
    
cdef extern from "assign_zb_E_derivatives.h":
    cdef void assign_zb_E_derivatives_c(long* x, int i, int inx, double* za, double* zb, int L, int l, double l_p,
                                 double* za_Ea_derivatives, double* zb_Ea_derivatives, double* za_Eb_derivatives, double* zb_Eb_derivatives,
                                 double* Ea, double* Eb, double cab, double sf, double D , double sig)
    cdef double cb_c(int, double, double, double)
    cdef double cb_D_derivative_c(int, double, double, double);
    cdef double cb_sig_derivative_c(int, double, double, double);
    cdef double cb_sf_derivative_c(int, double, double, double);
    
    
cpdef void assign_za_D_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_D_derivatives, double[:] zb_D_derivatives, double[:] Ea, double[:] Eb, 
                                     double cab, double sf, double D , double sig):
    if i == l-1:
        za_D_derivatives[i] = 0
        return
    za_D_derivatives[i] = (zb_D_derivatives[i-l]+np.sum(za_D_derivatives[:i-l+1]))*cab*exp(-Ea[x[i]])
    

cpdef void assign_za_sig_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sig_derivatives, double[:] zb_sig_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    if i == l-1:
        za_sig_derivatives[i] = 0
        return
    za_sig_derivatives[i] = (zb_sig_derivatives[i-l]+np.sum(za_sig_derivatives[:i-l+1]))*cab*exp(-Ea[x[i]])    


cpdef void assign_za_sf_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sf_derivatives, double[:] zb_sf_derivatives, double[:] Ea, double[:] Eb, 
                                      double cab, double sf, double D , double sig):
    if i == l-1:
        za_sf_derivatives[i] = 0
        return
    za_sf_derivatives[i] = (zb_sf_derivatives[i-l]+np.sum(za_sf_derivatives[:i-l+1]))*cab*exp(-Ea[x[i]])    


cpdef void assign_zb_D_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_D_derivatives, double[:] zb_D_derivatives, double[:] Ea, double[:] Eb, 
                                     double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j
    if i == l-1:
        der += 0
    else:
        for j in range(0,i-l+1):
            der += za_D_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]*cb_D_derivative_c(i-l-j, sf, D, sig)
        der += zb_D_derivatives[i-l]*cab
    der *= exp(-Eb[x[i]])
    der += zb_D_derivatives[i-1]
    
    zb_D_derivatives[i] = der
    
cpdef void assign_zb_sig_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sig_derivatives, double[:] zb_sig_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j
    if i == l-1:
        der += 0
    else:
        for j in range(0,i-l+1):
            der += za_sig_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]*cb_sig_derivative_c(i-l-j, sf, D, sig)
        der += zb_sig_derivatives[i-l]*cab
    der *= exp(-Eb[x[i]])
    der += zb_sig_derivatives[i-1]
    
    zb_sig_derivatives[i] = der


cpdef void assign_zb_sf_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                             double[:] za_sf_derivatives, double[:] zb_sf_derivatives, double[:] Ea, double[:] Eb, 
                                      double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j
    if i == l-1:
        der += 0
    else:
        for j in range(0,i-l+1):
            der += za_sf_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]*cb_sf_derivative_c(i-l-j, sf, D, sig)
        der += zb_sf_derivatives[i-l]*cab
    der *= exp(-Eb[x[i]])
    der += zb_sf_derivatives[i-1]
    
    zb_sf_derivatives[i] = der

    
    
cpdef void assign_za_cab_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                                    double[:] za_cab_derivatives, double[:] zb_cab_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    if i == l-1:
        za_cab_derivatives[i] = exp(-Ea[x[i]])
        return
    za_cab_derivatives[i] = exp(-Ea[x[i]])*((zb_cab_derivatives[i-l]+ np.sum(za_cab_derivatives[:i-l+1]))*cab + zb[i-l] + np.sum(za[:i-l+1]))
    
cpdef void assign_zb_cab_derivative_cy(long[:] x, int i, double[:] za, double[:] zb, 
                                    double[:] za_cab_derivatives, double[:] zb_cab_derivatives, double[:] Ea, double[:] Eb, 
                                       double cab, double sf, double D , double sig):
    cdef double der = 0
    cdef int j
    
    if i == l-1:
        der += 1
    else:
        for j in range(0,i-l+1):
            der += za_cab_derivatives[j]*cb_c(i-l-j, sf, D, sig) + za[j]
            der += zb_cab_derivatives[j]*cab + zb[j]
    der *= exp(-Eb[x[i]])
    der += zb_cab_derivatives[i-1]
    
    zb_cab_derivatives[i] = der
    
    
def DP_Z_cy(double[:] args, long[:] x):
    
    cdef int L = len(x)
    cdef double cab = 1.0

    cdef double[:] Ea = args[0:len(kmer_inx)]
    cdef double[:] Eb = args[len(kmer_inx):2*len(kmer_inx)]
    cdef double sf = args[-3]
    cdef double D = args[-2]
    cdef double sig = args[-1]
    
    #initialization of statistical weigths
    cdef double[:] za = np.zeros(L)
    cdef double[:] zb = np.zeros(L)

    cdef int i
    for i in range(0,l-1):
        zb[i] = 1 

    #initialization of derivatives
    cdef double[:,::1] za_Ea_derivatives = np.zeros((len(kmer_inx),L))
    cdef double[:,::1] zb_Ea_derivatives = np.zeros((len(kmer_inx),L))

    cdef double[:,::1] za_Eb_derivatives = np.zeros((len(kmer_inx),L))
    cdef double[:,::1] zb_Eb_derivatives = np.zeros((len(kmer_inx),L))

    cdef double[:] za_sf_derivatives = np.zeros(L)
    cdef double[:] zb_sf_derivatives = np.zeros(L)
    
    cdef double[:] za_D_derivatives = np.zeros(L)
    cdef double[:] zb_D_derivatives = np.zeros(L)

    cdef double[:] za_sig_derivatives = np.zeros(L)
    cdef double[:] zb_sig_derivatives = np.zeros(L)


    cdef int inx
    #dynamic programming calculation of z and derivatives 
    for i in range(l-1,L):
        #calculate statistical weights
        assign_za_cy(x, i, za, zb, Ea, Eb, cab, sf, D, sig)
        assign_zb_cy(x, i, za, zb, Ea, Eb, cab, sf, D, sig)
        
        #calculate derivatives
        for inx in range(len(kmer_inx)):
            assign_za_E_derivatives_cy(x, i, inx, za, zb, za_Ea_derivatives, zb_Ea_derivatives, za_Eb_derivatives, zb_Eb_derivatives, Ea, Eb, cab, sf, D, sig)
            assign_zb_E_derivatives_c(&x[0], i, inx, &za[0], &zb[0], L, l, l_p, 
                                      &za_Ea_derivatives[0,0], &zb_Ea_derivatives[0,0], &za_Eb_derivatives[0,0], &zb_Eb_derivatives[0,0], 
                                      &Ea[0], &Eb[0], cab, sf, D, sig)
        
        
        assign_za_sf_derivative_cy(x, i, za, zb, za_sf_derivatives, zb_sf_derivatives, Ea, Eb, cab, sf, D, sig)
        assign_zb_sf_derivative_cy(x, i, za, zb, za_sf_derivatives, zb_sf_derivatives, Ea, Eb, cab, sf, D, sig)
        
        assign_za_D_derivative_cy(x, i, za, zb, za_D_derivatives, zb_D_derivatives, Ea, Eb, cab, sf, D, sig)
        assign_zb_D_derivative_cy(x, i, za, zb, za_D_derivatives, zb_D_derivatives, Ea, Eb, cab, sf, D, sig)
        
        assign_za_sig_derivative_cy(x, i, za, zb, za_sig_derivatives, zb_sig_derivatives, Ea, Eb, cab, sf, D, sig)
        assign_zb_sig_derivative_cy(x, i, za, zb, za_sig_derivatives, zb_sig_derivatives, Ea, Eb, cab, sf, D, sig)
        
    #print(np.asarray(za))
    #print(np.asarray(zb))
    
    Z_x = zb[L-1] + np.sum(za)
    
    #derivative of Z(x)
    d_Ea = zb_Ea_derivatives[:,L-1] + np.sum(za_Ea_derivatives, axis=1)
    d_Eb = zb_Eb_derivatives[:,L-1] + np.sum(za_Eb_derivatives, axis=1)
    
    d_sf = zb_sf_derivatives[L-1] + np.sum(za_sf_derivatives)
    d_D = zb_D_derivatives[L-1] + np.sum(za_D_derivatives)
    d_sig = zb_sig_derivatives[L-1] + np.sum(za_sig_derivatives)
    
    
    gradient = np.concatenate([q.ravel() for q in [d_Ea, d_Eb, np.array([d_sf, d_D, d_sig])]])
    
    return Z_x, gradient



### implementation of the LL object

In [4]:
class nLL:
    def __init__(self, seqs_p, seqs_bg):
        
        self.N_p = len(seqs_p)
        self.N_bg = len(seqs_bg)

        #calculate background probabilities:

        #include positive sequences in bg sequences if not there
        X_bg_t = list(set(seqs_p + seqs_bg))  #number of unique sequences
        
        counts = np.zeros(len(X_bg_t))
        for i, x in enumerate(X_bg_t):
            counts[i] = seqs_bg.count(x)
            
        counts = counts + 1 #pseudocount to make sure 
        counts = counts/np.sum(counts)

        p_bg = dict(zip(X_bg_t, counts))

        self.pbg_xp = np.array([p_bg[x] for x in seqs_p])
        self.pbg_xbg = np.array([p_bg[xbg] for xbg in seqs_bg])
        
        self.X_p = [seq2int_cy(x) for x in seqs_p]
        self.X_bg = [seq2int_cy(x) for x in seqs_bg]
        
        
    def __call__(self, args):
        
        #exp parameters to make sure they are positive
        args = np.exp(args)
    
        #implement LL and derivatives   
        z_x = np.zeros(self.N_p)
        d_z_x = np.zeros((2*len(kmer_inx)+3, self.N_p))

        z_xbg = np.zeros(self.N_bg)
        d_z_xbg = np.zeros((2*len(kmer_inx)+3, self.N_bg))


        for i, xp in enumerate(self.X_p):
            z_x[i], d_z_x[:,i] = DP_Z_cy(args, xp)

        for i, xbg in enumerate(self.X_bg):
            z_xbg[i], d_z_xbg[:,i] = DP_Z_cy(args, xbg)

        ll = np.sum(np.log(self.pbg_xp) + np.log(np.ones(self.N_p) - (np.ones(self.N_p)/z_x)))

        ll -= self.N_p * logsumexp( np.log(self.pbg_xbg) + np.log(np.ones(self.N_bg) - (np.ones(self.N_bg)/z_xbg)) )

        
        dll = np.sum(d_z_x/(z_x*(z_x-1)), axis=1)

        dll -= self.N_p * ( np.sum((self.pbg_xbg * d_z_xbg)/(z_xbg*z_xbg), axis=1 ) / np.sum(self.pbg_xbg*(np.ones(self.N_bg) - (np.ones(self.N_bg)/z_xbg))))

        #exp modify dLL
        dll = dll*args

        #regularize
        if False:
            reg = 1e-5 
            ll -= np.sum(np.power(args[:-3],2)*reg)
            dll[:-3] -= 2*reg*args[:-3]

        return -ll, -dll 

In [25]:
class nLL:
    def __init__(self, seqs_p, seqs_bg):
        
        self.N_p = len(seqs_p)
        self.N_bg = len(seqs_bg)

        #calculate background probabilities:

        #include positive sequences in bg sequences if not there
        X_bg_t = list(set(seqs_p + seqs_bg))  #number of unique sequences
        
        counts = np.zeros(len(X_bg_t))
        for i, x in enumerate(X_bg_t):
            counts[i] = seqs_bg.count(x)
            
        counts = counts + 1 #pseudocount to make sure 
        counts = counts/np.sum(counts)

        p_bg = dict(zip(X_bg_t, counts))

        self.pbg_xp = np.array([p_bg[x] for x in seqs_p])
        self.pbg_xbg = np.array([p_bg[xbg] for xbg in seqs_bg])
        
        self.X_p = [seq2int_cy(x) for x in seqs_p]
        self.X_bg = [seq2int_cy(x) for x in seqs_bg]
        
        
    def __call__(self, args):
        
        comp = 0
        #exp parameters to make sure they are positive
        args = np.exp(args)
        print(args[-10:])
    
        #implement LL and derivatives   
        z_x = np.zeros(self.N_p)
        d_z_x = np.zeros((2*len(kmer_inx)+3, self.N_p))

        z_xbg = np.zeros(self.N_bg)
        d_z_xbg = np.zeros((2*len(kmer_inx)+3, self.N_bg))


        for i, xp in enumerate(self.X_p):
            z_x[i], d_z_x[:,i] = DP_Z_cy(args, xp)
            #print(d_z_x[comp,i])
        #print('\n')
        print(z_x[:5])

        for i, xbg in enumerate(self.X_bg):
            z_xbg[i], d_z_xbg[:,i] = DP_Z_cy(args, xbg)
            #print(d_z_xbg[comp,i])
        print(z_xbg[:5])
        print('\n')
        
        ll = np.sum(np.log(self.pbg_xp) + np.log(np.ones(self.N_p) - (np.ones(self.N_p)/z_x)))
        print("LL part 1: \t%f"%ll)
        ll -= self.N_p * logsumexp( np.log(self.pbg_xbg) + np.log(np.ones(self.N_bg) - (np.ones(self.N_bg)/z_xbg)) )
        print("LL part 1+2: \t%f"%ll)
        
        dll = np.sum(d_z_x/(z_x*(z_x-1)), axis=1)
        print(dll[comp])
        dll -= self.N_p * ( np.sum((self.pbg_xbg * d_z_xbg)/(z_xbg*z_xbg), axis=1 ) / np.sum(self.pbg_xbg*(np.ones(self.N_bg) - (np.ones(self.N_bg)/z_xbg))))
        print(dll[comp])
        print('\n')
        #exp modify dLL
        dll = dll*args

        #regularize
        if True:
            reg = 1e-5 
            ll -= np.sum(np.power(args[:-3],2)*reg)
            dll[:-3] -= 2*reg*args[:-3]

        print("final LL: \t%f"%ll)
        print(dll[-10:])
        print("============\n\n")
        return -ll, -dll 

### Import fasta files

In [5]:
def parse_fasta(file_name):
    input_seq_iterator = SeqIO.parse(file_name, "fasta")
    return [str(record.seq) for record in input_seq_iterator]

In [6]:
def swap_cores (args):
    core1_args = args[:len(kmer_inx)]
    core2_args = args[len(kmer_inx):len(kmer_inx)*2]
    return np.concatenate([x.ravel() for x in [core2_args, core1_args, np.array([args[-2], args[-1]])]])

In [35]:
if False:
    set_size = 100
    bg = parse_fasta('HNRNPA0_1_TGTCGA40NCCGA_AAG_1.fasta.tmp')
    plus = parse_fasta('HNRNPA0_4_TGTCGA40NCCGA_AAG_4.fasta.tmp')

    bg = random.sample(bg, set_size)
    plus = random.sample(plus, set_size)
    
    bg   = [seq.replace('N', random.sample(['A','T','C','G'],1)[0]) for seq in bg]
    plus = [seq.replace('N', random.sample(['A','T','C','G'],1)[0]) for seq in plus]
    
else:
    bg = parse_fasta('negatives_toy.fasta')
    plus = parse_fasta('positives_toy.fasta')

In [8]:
p_array = []
def callb(x):
    print('next iteration')
    p_array.append(x)

In [33]:
Ea = np.zeros(len(kmer_inx)) + 10
Eb = np.zeros(len(kmer_inx)) + 10
sf=100
D=3
sig=3

parameters = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([sf, D, sig])]])

In [15]:
nll_obj = nLL(plus[:20],bg[:20])

In [11]:
#DP_Z_cy(parameters, intrep)

In [42]:
nll_obj = nLL(['TTTAAAAATTTAAAAA']*2,['AAAAAAAAAAAAAAA']*2)
nll_obj(param)

(2.7731283316962854,
 array([-3.67163195e-05, -0.00000000e+00, -0.00000000e+00, -1.00211719e-05,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -1.49189642e-05,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -5.89581615e-06, -0.00000000e+00, -0.00000000e+00, -0.00000

In [21]:
param = parameters
intrep = seq2int_cy(plus[20])

for comp in [0,127,128,129,130]:
    epsilon = 1e-5
    x_fwd = param.copy()
    x_fwd[comp] += epsilon
    x_rev = param.copy()
    x_rev[comp] -= epsilon

    fx_fwd = DP_Z_cy(x_fwd, intrep)[0]
    fx_rev = DP_Z_cy(x_rev, intrep)[0]
    
    print('component %d'%comp)
    print((fx_fwd - fx_rev)/(2*epsilon))
    print(DP_Z_cy(param, intrep)[1][comp])

component 0
-4.759520555452922e-05
-4.7595320849452734e-05
component 127
-0.0001427860274461068
-0.0001427859757433113
component 128
6.782352457435081e-08
6.785510399252557e-08
component 129
-1.0418887974594782e-06
-1.041881587237952e-06
component 130
-2.7941537972253623e-06
-2.7943748686501242e-06


In [23]:
param = np.log(parameters)
param=x_op
for comp in [128,129,130]:
    epsilon = 1e-7
    x_fwd = param.copy()
    x_fwd[comp] += epsilon
    x_rev = param.copy()
    x_rev[comp] -= epsilon

    fx_fwd = nll_obj(x_fwd)[0]
    fx_rev = nll_obj(x_rev)[0]
    
    print('component %d\t, 1=%f, 2=%f'%(comp, fx_fwd, nll_obj(param)[0]))
    print((fx_fwd - fx_rev)/(2*epsilon))
    print(nll_obj(param)[1][comp])

component 128	, 1=73.777589, 2=73.777589
-1.4210854715202004e-07
-3.577867169202164e-15
component 129	, 1=73.777589, 2=73.777589
1.4210854715202004e-07
1.639313684798083e-16
component 130	, 1=73.777589, 2=73.777589
6.394884621840902e-07
1.491862189340054e-16


In [13]:
intrep = seq2int_cy('TTTAAAAATTTAAAAA')

for comp in range(len(parameters)):
    epsilon = 1e-9
    x_fwd = parameters.copy()
    x_fwd[comp] += epsilon
    x_rev = parameters.copy()
    x_rev[comp] -= epsilon

    fx_fwd = DP_Z_cy(x_fwd, intrep)[0]
    fx_rev = DP_Z_cy(x_rev, intrep)[0]
    
    print('component %d'%comp)
    print((fx_fwd - fx_rev)/(2*epsilon))
    print(DP_Z_cy(parameters, intrep)[1][comp])

component 0
-187657.5406640768
-187657.78312047664
component 1
0.0
0.0
component 2
0.0
0.0
component 3
-664313.3237957954
-664312.8493563096
component 4
0.0
0.0
component 5
0.0
0.0
component 6
0.0
0.0
component 7
0.0
0.0
component 8
0.0
0.0
component 9
0.0
0.0
component 10
0.0
0.0
component 11
0.0
0.0
component 12
0.0
0.0
component 13
0.0
0.0
component 14
0.0
0.0
component 15
-2087007.7423751352
-2087008.2133899292
component 16
0.0
0.0
component 17
0.0
0.0
component 18
0.0
0.0
component 19
0.0
0.0
component 20
0.0
0.0
component 21
0.0
0.0
component 22
0.0
0.0
component 23
0.0
0.0
component 24
0.0
0.0
component 25
0.0
0.0
component 26
0.0
0.0
component 27
0.0
0.0
component 28
0.0
0.0
component 29
0.0
0.0
component 30
0.0
0.0
component 31
0.0
0.0
component 32
0.0
0.0
component 33
0.0
0.0
component 34
0.0
0.0
component 35
0.0
0.0
component 36
0.0
0.0
component 37
0.0
0.0
component 38
0.0
0.0
component 39
0.0
0.0
component 40
0.0
0.0
component 41
0.0
0.0
component 42
0.0
0.0
component 43
0

In [57]:
nll_obj = nLL(['TTTAAAAATTTAAAAA']*10,['AAAAAAAAAAAAAAAA']*10)
#x_opt, fx, info = fmin_l_bfgs_b(nll_obj, x0=np.log(parameters), callback=callb)
check_grad(lambda input_vector: nll_obj(input_vector)[0], lambda input_vector: nll_obj(input_vector)[1], x0 = np.log(parameters))
#intrep = seq2int_cy('TTTAAAAATTTAAA')
#check_grad(lambda input_vector: DP_Z_cy(input_vector, intrep)[0], lambda input_vector: DP_Z_cy(input_vector, intrep)[1], x0 = parameters)

7.363357462476019e-07

In [324]:
p = 'TTGTGGTGTTAGTGTTAGTGTTAGGTTAGATCATTTGTCT'
DP_Z_cy(np.exp(param2), seq2int_cy(p))

<MemoryView of 'ndarray' object>
<MemoryView of 'ndarray' object>


(55.53573882331846,
 array([  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,  -2.35129631,
         -2.36383122,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,  -9.45685511,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
         -2.35908573,   0.        ,  -2.40397879,   0.        ,
          0.        ,   0.        ,   0.        ,  -9.45569485,
          0.        ,   0.        ,   0.        ,   0.        ,
         -2.3637359 ,   0.        ,   0.        ,  -2.38007508,
          0.        ,   0.        ,  -9.4407371 ,  -5.13057607,
          0.        ,  -2.33646005,  -2.36254966,   0.        ,
          0.        ,   0.        ,   0.        ,  -2.67120619,
         -4.72739389,   0.        ,  -4.75550881, -11.85723375,
         -2.31671574

### optimization

In [36]:
len(plus)

400

In [37]:
nll_obj = nLL(plus,bg)

In [24]:
nll_obj(np.log(parameters))

(73.77758908227871,
 array([-8.07684112e-01,  1.44589166e-01, -6.61413425e-01, -1.19240793e-02,
         4.09985417e-03, -5.38140712e-01, -2.43145650e-01,  1.05972413e+00,
        -6.64915184e-01, -7.95395538e-01,  1.38721418e-01,  5.12829173e-01,
        -2.74401466e-01, -1.74408065e+00,  1.88311793e+00,  6.61441565e-01,
        -2.86284515e-01, -9.32360501e-01, -3.92503599e-01,  1.13069895e-02,
        -1.34400677e+00, -1.34620099e-01, -9.43904318e-01, -2.63421568e-01,
        -5.38422244e-01, -1.61356090e+00,  6.73418599e-01, -7.84066564e-01,
         4.03774054e-01, -5.42251998e-01,  2.69188312e-01,  5.38343516e-01,
        -8.01763641e-01,  6.72353164e-01, -5.42256265e-01,  4.07568647e-01,
         3.99405393e-01, -5.33976697e-01, -1.06496428e+00, -1.38379780e-01,
         2.63423938e-01, -3.98307184e-01, -5.26460378e-01,  5.34318894e-01,
         3.99814475e-01,  7.94589123e-01, -1.07344813e+00,  5.78928853e-03,
         8.31280213e-01,  6.72646415e-01,  5.28199575e-01, -2.695150

In [38]:
x_opt, fx, info = fmin_l_bfgs_b(nll_obj, x0=np.log(parameters), callback=callb)

[ 10.  10.  10.  10.  10.  10.  10. 100.   3.   3.]
[1.00346086 1.00346086 1.00346086 1.00346086 1.00346086]
[1.00346086 1.00346086 1.00346086 1.00346086 1.00346086]


LL part 1: 	-5103.907402
LL part 1+2: 	-2673.844691
-2.5778970214366606
0.3947117486785494


final LL: 	-2673.972691
[ 6.61196502e-01 -1.58379712e+00  2.89162233e+00  6.04561398e+00
  5.12879095e+00 -2.11158661e+00  5.39087967e+00 -5.20417043e-16
 -2.28983499e-16  2.49800181e-16]


[ 10.05249786   9.87536142  10.23163122  10.49040121  10.41451278
   9.83417253  10.43615051 100.           3.           3.        ]
[1.0182558  1.0103033  1.01595228 1.0168566  1.01687042]
[1.00345437 1.00344854 1.00329107 1.00301309 1.01015715]


LL part 1: 	-4539.226648
LL part 1+2: 	-2401.721515
-0.5742898005468253
0.46806996341645624


final LL: 	-2401.851146
[6.34812064 6.54094112 5.27815658 5.58699381 4.99794516 6.85335814
 5.00803921 1.30044957 1.01369598 0.01716549]


[ 10.26525985   9.39214946  11.21306636  12.70458526  12.25165576
 

In [39]:
info

{'grad': array([-1.34164732e-02,  1.40233123e-02, -5.99113111e-02, -2.45344334e-03,
         2.96512833e-03,  4.16094656e-02, -7.20613377e-02, -1.17526586e-03,
        -6.52568156e+00,  3.88876719e-03,  5.31218229e-03,  3.65738965e-03,
        -7.11844986e+00, -1.81274698e+00, -9.82436203e+00, -3.57994448e+00,
        -6.59343895e-04,  4.00140513e-02, -1.19574045e-02, -1.14583111e-02,
         1.18077188e-02,  6.62347512e-02,  9.78655873e-01,  1.49539001e-02,
        -3.60505712e+00,  1.87245102e-03,  4.97381585e-03,  1.76327641e-02,
         1.17544072e-01,  1.56475811e+00, -8.13873962e-02,  2.69444286e-04,
        -1.25460197e-02, -1.08025315e-02, -8.46386555e-05, -2.61353290e+01,
        -1.59218660e-02,  1.24031272e-02, -4.30057091e-03, -1.39799985e-03,
        -6.35685233e-01, -1.25704394e-03, -4.65564637e-03,  1.48175151e-02,
        -3.94173106e-04,  1.59398826e-01, -2.69207276e-03, -3.16000154e-03,
         3.64471831e-03, -9.18812783e-03,  7.52540055e-01, -1.28640113e-01,
    

In [40]:
np.exp(x_opt)

array([ 11.70209788,  12.38906908,   9.82966539,  14.01773907,
        13.7908865 ,  11.79618128,  10.93722537,  11.75592258,
         4.56725591,  11.62098332,  11.94682177,  12.80014926,
         5.00372657,   5.40753525,   4.71325306,   5.82801118,
        14.16220622,  11.79061392,  10.23921027,  12.9510295 ,
        11.80212076,  10.73672465,   8.38370564,  11.74743422,
         5.51090008,  12.94363135,  11.19892781,  12.11585015,
        10.47896745,   8.2884837 ,  10.20641581,  17.043311  ,
        10.8494041 ,  12.02040373,  11.44764098,   1.23419992,
        12.54571747,  11.00957375,  12.4023851 ,  13.72268129,
         6.04625702,  14.01924102,  13.87513737,  11.87635756,
        13.01168688,  10.68057241,  11.01968514,  13.15164726,
        11.18848138,  11.5783669 ,   7.55778721,   9.34755845,
        10.95343058,   8.89342814,   8.52000603,   7.90490033,
         5.61642858,  10.27806847,   9.27055971,  11.21334306,
        12.74386506,  12.26560025,   9.18410066,  12.40

In [41]:
core1 = {}
for i in range(len(kmer_inx)):
    core1[inx_kmer[i]] = np.exp(x_opt)[i]

pd.Series(core1).sort_values(ascending=True)

TAG     1.234200
AGA     4.567256
GTA     4.713253
ATA     5.003727
CTA     5.407535
         ...    
GGG    13.875137
TAA    14.017739
CGG    14.019241
AAC    14.162206
TTC    17.043311
Length: 64, dtype: float64

In [42]:
core2 = {}
for i in range(len(kmer_inx)):
    core2[inx_kmer[i]] = np.exp(x_opt)[i+64]
pd.Series(core2).sort_values(ascending=True)

TAG     1.228647
GTA     4.375790
AGA     4.848629
ATA     5.095288
CTA     5.415900
         ...    
GGG    13.871671
CGG    14.017463
TAA    14.018468
AAC    14.166579
TTC    17.040767
Length: 64, dtype: float64

In [150]:
x_opt, fx, info = fmin_l_bfgs_b(nll_obj, x0=swap_cores(x_opt), callback=callb, pgtol=1e-6)

[16.67470408 15.90506218 15.40663549 16.65659094 15.37226262 17.57691134
 18.71241176 15.26813967  0.1109494  12.18249396]
[1.00049347 1.00158774 1.00191142 1.00104375 1.0011884 ]
[1.00036705 1.00108433 1.0007672  1.000221   1.00044074]
LL part 1: 	-2692.932507
LL part 1+2: 	-1157.083439
final LL: 	-1157.817550
[ 3.14916388e-02  1.14712603e-01  3.86449691e-01 -5.30209290e-02
 -5.47356707e-01 -9.00305482e-02 -8.20593600e-04 -2.97083249e+00
 -1.16208575e-02  0.00000000e+00]


[16.67664332 15.91180111 15.42863752 16.65333    15.34122263 17.57106865
 18.71235506 15.10155447  0.11094464 12.18249396]
[1.00367846 1.03367406 1.04881998 1.04999608 1.03876285]
[1.00031698 1.04702534 1.02999541 1.00019554 1.01977034]
LL part 1: 	-2124.225046
LL part 1+2: 	-1257.274054
final LL: 	-1258.009818
[-0.44872529 -1.10902925 -2.03071211 -0.41545092 -1.93731208 -0.28037227
 -0.06505667 -4.48391213  1.17257189  0.        ]


[16.67513866 15.90657217 15.41156353 16.65586009 15.36530093 17.57560179
 18.712399

In [151]:
info

{'grad': array([ 4.25035451e-04, -6.66331167e-03,  5.91623252e-04,  3.84238633e-04,
         4.23758515e-04,  4.24073351e-05,  3.86692959e-04,  3.87300639e-04,
         4.04712869e-04,  4.05248002e-04,  1.23800643e-03,  1.35259609e-03,
         4.83486950e-04,  4.60462314e-04,  3.38799483e-04,  8.36429012e-04,
         4.48109885e-04,  4.63599984e-04,  5.16712942e-04,  4.29845175e-04,
         2.59687636e-04, -4.21601759e-04,  4.33498156e-04,  2.64493714e-04,
         2.97510054e-04,  3.68259523e-04,  4.48981639e-04,  8.28952190e-04,
         3.25727334e-04,  3.13447837e-04,  4.34578312e-04,  3.61050889e-04,
         2.92269055e-04,  4.24710325e-04,  5.82334889e-04,  7.20973130e-04,
         3.65241616e-04,  3.78229397e-04,  5.87795164e-04,  4.93249480e-04,
         4.72402124e-04,  4.07938996e-04,  4.21739776e-04,  7.16801648e-04,
        -1.63260416e-01,  3.36408903e-04,  1.92325791e-01,  3.22293958e-04,
         5.28064355e-04,  1.03602740e-03,  1.14848670e-03,  1.87847456e-03,
    

In [13]:
np.exp(x_opt[-4:])

array([4.77134003e+00, 7.31678021e+03, 5.98350414e+00, 1.82418325e-02])

In [14]:
core1 = {}
for i in range(len(kmer_inx)):
    core1[inx_kmer[i]] = np.exp(x_opt)[i]

pd.Series(core1).sort_values(ascending=True)

TTT      5.660539
AAA      8.399421
AGA      9.168449
CTA      9.809132
ATA     10.137845
          ...    
TCT     25.512313
GAT     49.844746
CTT    147.737640
TTG    304.650022
GTT    775.787288
Length: 64, dtype: float64

In [15]:
core2 = {}
for i in range(len(kmer_inx)):
    core2[inx_kmer[i]] = np.exp(x_opt)[i+64]
pd.Series(core2).sort_values(ascending=True)

TTT       4.771340
GCG       6.881357
CGC       8.018314
AAA       9.203048
ATA       9.421745
          ...     
TCT      25.495074
GAT      50.030763
CTT     155.782047
TTG     210.852680
GTT    1408.126797
Length: 64, dtype: float64

### debug mode

In [18]:
def set_kmer_arg (args, kmer, domain, value):
    inx = kmer_inx[kmer]
    inx = inx + (domain-1)*len(kmer_inx)
    args_m = args
    args_m[inx] = value
    return args_m

In [38]:
Ea = np.zeros(len(kmer_inx)) + 8
Eb = np.zeros(len(kmer_inx)) + 8
sf=10000
D=3
sig=1

parameters = np.concatenate([x.ravel() for x in [Ea, Eb, np.array([sf, D, sig])]])


parameters = set_kmer_arg(parameters, 'TTT', 1, 0.1)
parameters = set_kmer_arg(parameters, 'TTT', 2, 0.1)
#parameters = set_kmer_arg(parameters, 'AAA', 1, 3)
#parameters = set_kmer_arg(parameters, 'AAA', 2, 3)

100